<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e87803</article-id><article-id pub-id-type="doi">10.2196/87803</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>AI Chatbot Answers for Drug Dosing Adjustments According to Renal Function in Geriatric Patients Using the New Scoring System (AI Quality Output Score): Cross-Sectional Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Barbonus</surname><given-names>Celine</given-names></name><degrees>Dip</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sultzer</surname><given-names>Ralf</given-names></name><degrees>Dr med</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Bertsche</surname><given-names>Thilo</given-names></name><degrees>Prof Dr</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff 
id="aff1"><institution>Department of Clinical Pharmacy, Institute of Pharmacy, Faculty of Medicine, Leipzig University</institution><addr-line>Br&#x00FC;derstra&#x00DF;e 32</addr-line><addr-line>Leipzig</addr-line><addr-line>Saxony</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Drug Safety Center, Leipzig University and University of Leipzig Medical Center</institution><addr-line>Leipzig</addr-line><addr-line>Saxony</addr-line><country>Germany</country></aff><aff id="aff3"><institution>Sana Geriatric Centre Zwenkau</institution><addr-line>Zwenkau</addr-line><addr-line>Saxony</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Emam</surname><given-names>Khaled El</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lin</surname><given-names>Kuan-Hsun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Singh</surname><given-names>Reenu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Thilo Bertsche, Prof Dr, Department of Clinical Pharmacy, Institute of Pharmacy, Faculty of Medicine, Leipzig University, Br&#x00FC;derstra&#x00DF;e 32, Leipzig, Saxony, 04103, Germany, 49 3419711800; <email>thilo.bertsche@uni-leipzig.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>5</day><month>6</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e87803</elocation-id><history><date date-type="received"><day>14</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>22</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>25</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Celine Barbonus, Ralf Sultzer, Thilo Bertsche. 
Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 5.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e87803"/><abstract><sec><title>Background</title><p>Preventable adverse drug reactions in geriatric patients are caused by overdosing, especially in cases of impaired renal function. Artificial intelligence (AI) chatbots are being discussed as tools to generate drug information, which can adjust drug dosing and prevent subsequent adverse drug reactions based on individualized patient data. However, the question arises as to the extent to which such AI chatbots can withstand scientific evaluation in this task.</p></sec><sec><title>Objective</title><p>We newly developed and validated the AI quality output score (AQUOS, ranging from 0% to 100%) to assess the quality of AI chatbot answers. We investigated whether AQUOS depends on (1) renal function, (2) medication complexity, (3) prompting language (English and German), and (4) whether the answers are reproducible (assessed at 2 independent times). 
Additionally, we assessed the potential for harm.</p></sec><sec sec-type="methods"><title>Methods</title><p>In a standardized prompt, we asked 4 AI chatbots (ChatGPT, Copilot, Gemini, and Scite) whether the medication of 100 geriatric patients with polymedication at discharge should be adjusted according to their renal function. We prompted drug-related queries in 2 languages and at 2 times to assess AI chatbot answers, and we scored the generated outputs based on AQUOS. Additionally, we assessed possible harm from the AI chatbot answers using the World Health Organization definition &#x201C;The conceptual framework for the international classification for patient safety.&#x201D;</p></sec><sec sec-type="results"><title>Results</title><p>We analyzed 1600 AI chatbot answers, with AQUOS values ranging from &#x2212;19.0% to 95.2%, depending on the chatbot. We found that AQUOS declined with decreasing renal function (ChatGPT: &#x2212;0.215; <italic>P</italic>=.03) and increasing medication complexity (Scite: &#x2212;0.239; <italic>P</italic>=.02). Possible harm also correlated with more complicated patient statuses (lower kidney function and higher medication complexity) across all chatbots. Overall scores were up to 4.8% higher in English than in German prompting. The AI chatbot answers were highly reproducible.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In renal drug dosing, the quality of AI chatbot answers declined as renal function decreased and medication complexity increased. 
Even the highest AQUOS achieved is insufficient for deploying AI chatbots in the high-risk health care sector.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>large language models</kwd><kwd>LLMs</kwd><kwd>pharmaceutical</kwd><kwd>score</kwd><kwd>decision-making</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Artificial intelligence (AI), and especially AI chatbots, has become important in medical practice, supporting clinical decision-making [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. AI chatbots, AI-based search engines, and other large language models (LLMs) are increasingly studied for their potential to make therapy processes&#x2014;and, in particular, the gathering of drug information&#x2014;faster and more efficient. In contrast to the high number of publications dealing with AI in health care, only a few involve real patient data [<xref ref-type="bibr" rid="ref3">3</xref>]. Earlier studies often focus on limited aspects and lack practical relevance to specific clinical needs, such as renal dosage adjustment [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Adjusting medication based on renal function is challenging, especially for geriatric patients. Adverse drug reactions often result from overdosing, frequently caused by impaired renal function [<xref ref-type="bibr" rid="ref7">7</xref>]. Age-related changes in pharmacokinetic parameters, comorbidities, and polypharmacy require careful, individualized drug therapy, especially in terms of patient safety and the safety of drug therapy [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. However, tailored medication dosages adjusted to renal function should also be readily available under routine conditions to prevent avoidable adverse drug reactions. 
AI chatbots could have great potential in this context, although it should be noted from a legal perspective that they are not classified as medical devices.</p><p>Until now, no standardized score has existed for a quantitative quality assessment of AI chatbot answers in drug-related queries. To close this gap, we developed a new score to evaluate the quality (AI quality output score [AQUOS]) of AI chatbot answers based on the literature, adapted for drug-related queries [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Rather than relying on casual, everyday phrasing, prompts were structured using prompt engineering to provide consistent input and optimize AI potential [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Because medication queries are not only applicable to geriatric patients and renal function, AQUOS applies to a broader range of drug-related clinical scenarios.</p><p>Our study aimed to evaluate the quality and potential harm of drug-related queries addressing tailored renal dosing. We developed a score assessing quality (AQUOS) to see how it varies with renal function and medication complexity (ie, the number of drugs prescribed). Furthermore, we evaluated 2 prompting languages, as in routine practice, many questions are asked in the native language, and no previous studies have compared multilingual performance in this context. We also tested whether the AI chatbot outputs are reproducible over time. 
In addition to the score, we assessed potential harm according to the World Health Organization (WHO) definition.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>The cross-sectional, observational study is based on the CHART (Chatbot Assessment Reporting Tool) checklist (<xref ref-type="supplementary-material" rid="app7">Checklist 1</xref>) [<xref ref-type="bibr" rid="ref14">14</xref>].</p><sec id="s2-1"><title>Ethical Considerations</title><p>The ethics committee of the Medical Faculty of Leipzig University (231/24-ek) approved the procedure on July 29, 2024. Because patient data were collected retrospectively, written informed consent was not required, and no informed consent for publication was obtained. Anonymized prompts were used to ensure patient privacy.</p></sec><sec id="s2-2"><title>Setting</title><p>In 2024, we used GeriDoc from the Geriatrics in Bavaria database to collect retrospective patient data from a geriatric hospital.</p></sec><sec id="s2-3"><title>Patient Data</title><p>We included data from geriatric patients in the rehabilitation ward who were hospitalized in 2023.</p><p>One hundred patients who met the inclusion criteria were to be included. An additional 10 patients were used for the pretesting of the AQUOS, and these patients were excluded from the main analysis.</p></sec><sec id="s2-4"><title>Inclusion Criteria</title><p>Eleven randomly selected patients per month from January 2023 to October 2023 were chosen for this study. To be included, renal function (glomerular filtration rate [GFR]) and polymedication (at least 5 drugs) at discharge had to be documented. In this case, polymedication means the patient must take at least 5 drugs; for example, a combined preparation of 2 drugs counts as 2, not 1.</p></sec><sec id="s2-5"><title>Study Design</title><p>In this study, we compared 4 different AI chatbots and designed structured inputs or prompts. 
After receiving ethical approval, we generated the AI outputs from October 18 to October 30, 2024, in Leipzig, Germany. The GFR was categorized into the typical 5 stages of renal disease: category 1 as normal renal function, category 2 as slightly reduced, category 3 as moderately reduced, category 4 as severely reduced, and category 5 as renal failure. The inputs were designed in both German and English, and outputs were evaluated at 2 time points: t0 and t1 (8 d later) to test reproducibility (not the learning effect after updates). Thus, t1 serves as the control for t0. We used an AQUOS (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) for evaluation. Additionally, discharge medications were categorized by complexity: low (5&#x2010;9 drugs), medium (10-14 drugs), and high (&#x2265;15 drugs). In this study, medication complexity is defined exclusively as complexity based on the number of medications and is divided into 3 categories (low, medium, and high). This measure reflects structural complexity based on the number of prescribed drugs and does not account for pharmacological risk, therapeutic drug classification, or potential drug-drug interactions.</p></sec><sec id="s2-6"><title>Prompting</title><p>We created a prompt based on prompt engineering, assigning roles to both the AI chatbot and the requester. We specified the renal function as GFR and listed the patient&#x2019;s discharge medications and dosages. Following this, we instructed the chatbot to provide a precise answer if a dosage adjustment was necessary and to cite sources as a concrete task using both German and English prompts. The following prompting structure was developed based on literature [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>] by the 2 authors, CB and TB, who are both pharmacists. We adjusted the prompt based on the quality of the AI chatbot&#x2019;s responses. 
The aim was to frame the query as a standardized zero-shot prompt without any follow-up prompts. Neither patients nor members of the public were involved in the development process.</p><p>An example prompt in English:</p><disp-quote><p>I am a physician in a hospital. Give your answers from a pharmacist&#x2019;s point of view. It&#x2019;s about a geriatric patient with a GFR of 62 ml/min who is taking the following medication: Acetylsalicylic acid 100 mg once a day, Ramipril 10 mg once a day, Atorvastatin 10 mg once a day, Pantoprazole 20 mg once a day, Metamizole 500 mg if required up to four times a day. Give a precise, short answer whether and how the dosage should be adjusted for the current GFR. Give reliable sources, including links, to your answer.</p></disp-quote><p>The prompt was deliberately restricted to GFR, drug name, dose, and frequency. This standardized, minimal input format was chosen to ensure comparability across all 100 patient cases and all 4 AI chatbots and to reflect a realistic scenario of brief, point-of-care queries as they might occur in routine clinical practice. We acknowledge that clinically valid renal dosage adjustment may additionally depend on variables such as the indication for each drug, route of administration, treatment duration, dialysis status, body weight, or the differentiation of acute and chronic renal impairment. While more complex prompt formats incorporating additional clinical variables could yield more nuanced AI outputs, such designs would compromise standardization and cross-case comparability, which was an essential aspect of this study. The omission of these variables limits the clinical interpretability of the findings and should be considered when applying results to real-world settings.</p><p>If multiple GFR values were included in the discharge letter, the median GFR was used. The drug dosing was provided as in the example, along with the drug name and dosage frequency. 
We extracted chatbot outputs from the platform and saved them locally, starting a new conversation for each prompt while ensuring previous conversations were cleared. We configured the software to prevent it from &#x201C;remembering&#x201D; previous conversations. If a network error occurred, the output was regenerated. After generating the outputs, we verified that the companies had made no relevant updates to the AI chatbots.</p><p>Considering that the prompts were entered in German and English at 2 different times (t0 and t1) in 4 different AI chatbots each, 16 outputs were generated per patient.</p></sec><sec id="s2-7"><title>AI Chatbots</title><p>When selecting the AI chatbots, we focused on large, well-known systems. Moreover, appropriate settings had to be available to prevent entered data from being used for future AI model updates.</p><p>In terms of terminology, &#x201C;AI chatbot&#x201D; refers to the conversational interface accessed by users, &#x201C;LLM&#x201D; refers to the underlying language model architecture, and &#x201C;model&#x201D; is used as a general term for naming the different versions.</p><p>We used OpenAI GPT-4 (gpt-4o-2024-11-20, knowledge cutoff at October 01, 2023), Microsoft Copilot Business, Google Gemini 1.5 Flash (gemini-1.5-flash-002), and Research Solutions Scite, all of which are closed source [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. The exact models of the AI chatbots were not consistently accessible at the time of data collection, but we verified post hoc that no relevant model updates were released by the respective providers during the data collection window. We accessed the AI chatbots using the web interface of each provider. Furthermore, the standard settings (eg, temperature) were applied to all AI chatbots, which were used as base models as provided by their respective companies. 
For each patient, a new conversation was initiated by entering the prompt and then saving the AI chatbot&#x2019;s response locally, and the conversation was subsequently deleted before the next case was processed. With this procedure, no carry-over effects between patient cases could occur.</p></sec><sec id="s2-8"><title>Quality Score (AQUOS)</title><p>Based on previous research [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], we implemented an AQUOS (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) to evaluate the outputs of the AI chatbots. The score consists of 9 items. The first 5 items, rated from 0 to 4, focus on completeness, referencing, reference suitability, correctness of dosage recommendations, and dosing accuracy. The sixth item, rated 0 or 1 point, checks for disclaimers and references to health care professionals, patient monitoring, and the individual patient case. The last 3 items allow point deductions for unnecessary additional information, incorrect use of medical terms, and inappropriate language or phrasing, with greater issues leading to greater point reductions. Thus, the highest achievable score was 21, which equals 100%.</p><p>As a reference standard for evaluating the correctness of the AI chatbot outputs, the German database &#x201C;Dosing&#x201D; [<xref ref-type="bibr" rid="ref19">19</xref>], combined with the corresponding summary of product characteristics, was used. In cases of discrepancies, the more recent source was given preference. If discrepancies persisted, clinical guidelines, original publications, or information from the European Medicines Agency were consulted. 
In <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, examples of higher- and lower-quality AI chatbot outputs are provided.</p></sec><sec id="s2-9"><title>Assessment of Harm</title><p>In addition to AQUOS, the possible harm the chatbot could have caused with its response was assessed. The outputs were evaluated to determine whether the specific output could cause harm if provided to a physician as pharmaceutical advice. The possible harm was ranked using the <italic>Conceptual Framework for the International Classification for Patient Safety</italic> by the WHO Patient Safety [<xref ref-type="bibr" rid="ref20">20</xref>]. The ranking of the harm ranges from &#x201C;none&#x201D; to &#x201C;death,&#x201D; according to the WHO categories: none, mild, moderate, severe, and death (0-5). Since the scaling differs from AQUOS, possible harm was considered separately.</p></sec><sec id="s2-10"><title>Validation Procedure of AQUOS</title><p>The validation of the AQUOS scoring system was conducted in 2 sequential phases, following the principles of internal and external validation as commonly applied in laboratory and clinical assay development.</p><p>The validation design used in this study broadly follows the principles used in the development of clinical scoring systems, where initial reliability testing is followed by a comparison with expert consensus, including interrater reliability and intraclass correlation coefficient analyses [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. The detailed validation procedure is provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec id="s2-11"><title>Outcomes</title><p>We analyzed the points achieved in AQUOS to evaluate the quality of the AI chatbot outputs. 
Medication count&#x2013;based complexity (categorized into 3 groups depending on the number of drugs, not the type of drugs or, eg, possible drug-drug interactions) and GFR (categorized according to the typical 5 stages, ranging from normal renal function to renal failure [<xref ref-type="bibr" rid="ref26">26</xref>]) were analyzed in correlation with the score. Additionally, we investigated whether the AQUOS differs between English and German and whether the AI chatbot answers are reproducible over time (t0 and t1).</p></sec><sec id="s2-12"><title>Statistical Methods</title><p>To analyze the data, we conducted descriptive analyses (mean, median, relative difference, and correlations) and a paired, 2-tailed <italic>t</italic> test to assess statistical significance, with <italic>P</italic>&#x003C;.05 considered statistically significant. To evaluate the first validation phase, we performed Cohen &#x03BA; to examine whether the score was objective and to continue with just 1 rater in the study. For the second validation phase of AQUOS, we conducted an intraclass correlation coefficient analysis among the expert opinions, and AQUOS was then correlated with the median of the expert ratings using the Spearman&#x2019;s correlation.</p><p>The statistical analysis was performed using IBM SPSS Statistics version 29 and Excel version 2408 from Microsoft 365.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>The main analysis included 100 geriatric patients, while the pilot study involved 10 geriatric patients. The mean (SD) number of discharge medications was 11.4 (4.2) for the primary group and 11.1 (2.8) for the pilot group. Patients&#x2019; characteristics and the medication count from the main analysis are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p><p>In total, we generated 1600 outputs using the AI chatbots. 
More precisely, this means that 16 outputs were generated per patient using 4 chatbots (ChatGPT [OpenAI], Copilot [Microsoft], Gemini [Google], and Scite [Research Solutions]), each at 2 different times for the investigation of reproducibility (t0 and t1) and in 2 different languages (German and English). <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> provides examples of how the AI chatbots&#x2019; outputs were evaluated. A study flow diagram illustrating patient selection, the pilot sample, the derivation of the final 100 included patient cases, and the generation of 1600 AI chatbot outputs is provided in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Patient characteristics (N=100; all geriatric patients) and medication count from the main analysis<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Low complexity</td><td align="left" valign="bottom">Medium complexity</td><td align="left" valign="bottom">High complexity</td></tr></thead><tbody><tr><td align="left" valign="top">Number of patients</td><td align="left" valign="top">33</td><td align="left" valign="top">46</td><td align="left" valign="top">21</td></tr><tr><td align="left" valign="top">GFR<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, mean (SD; min-max)</td><td align="left" valign="top">67.9 (18.3; 28.0-109.0)</td><td align="left" valign="top">64.6 (18.1; 19.5-100.5)</td><td align="left" valign="top">61.2 (25.1; 15.0-97.0)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Number of drugs taken in different complexity intervals: low complexity=5&#x2010;9 drugs, medium complexity=10&#x2010;14 drugs, and high complexity=at least 15 drugs. 
The renal function was measured as GFR.</p></fn><fn id="table1fn2"><p><sup>b</sup>GFR: glomerular filtration rate.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study flow diagram. From 110 randomly selected geriatric patients (11 patients/mo, January 2023-October 2023), 10 were allocated to the pilot sample for artificial intelligence quality output score (AQUOS) validation, and 100 to the main analysis. The 100 patients included in the main analysis were each prompted across 4 AI chatbots, 2 languages, and 2 time points, resulting in 1600 outputs evaluated using AQUOS and World Health Organization (WHO) harm classification. GFR: glomerular filtration rate.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e87803_fig01.png"/></fig></sec><sec id="s3-2"><title>Validation of AQUOS</title><p>As a result of the first validation phase, the score was considered objective with a Cohen &#x03BA; of 0.971, so there was 1 rater, a pharmacist, responsible for scoring. No patients or members of the public were included in the scoring process. For the second score validation phase, the intraclass correlation coefficient of 0.906 (95% CI 0.795&#x2010;0.974; <italic>P</italic>&#x003C;.001) shows excellent agreement between the raters of the expert panel. 
In addition, the Spearman correlation, with a Spearman &#x03C1; of 0.650 (95% CI 0.012&#x2010;0.912; <italic>P</italic>=.04) between AQUOS and the median of the expert panel, validates AQUOS in an external validation conducted by experts.</p></sec><sec id="s3-3"><title>Renal Function</title><p>There were 5 patients in category 1 (normal renal function), 62 in category 2 (slightly reduced), 25 in category 3 (moderately reduced), 8 in category 4 (severely reduced), and no patients in category 5 (renal failure) based on GFR classification.</p><p>The trends of overall AQUOS in German and English, as shown in <xref ref-type="fig" rid="figure2">Figure 2</xref> and <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>, decline with worsening renal function (ChatGPT: &#x2212;0.215, <italic>P</italic>=.03; Copilot: &#x2212;0.258, <italic>P</italic>=.01; Scite: &#x2212;0.357, <italic>P</italic>&#x003C;.01; <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). Regarding the English outputs, Copilot and Gemini had higher overall mean scores in category 4 (Copilot: 71.4%, SD 9.4%; Gemini: 45.8%, SD 31.4%) than in category 3 (Copilot: 69.3%, SD 16.0%; Gemini: 41.9%, SD 60.0%). The maximum overall mean AQUOS was reached by ChatGPT (81.0%, SD 18.6%). Gemini had the lowest overall mean AQUOS of 41.9% (SD 60.0%). ChatGPT reached the highest single overall AQUOS with 95.2%, while Gemini achieved the lowest overall score in English (&#x2212;19.0%).</p><p>Gemini caused possible mild harm in category 2 (mean 0.6, SD 0.9). In category 3, mild harm was noted in Copilot (mean 0.5, SD 0.7), Gemini (mean 0.7, SD 0.7), and Scite (mean 0.5, SD 0.7). Category 4 indicated overall possible mild harm across all chatbots. 
Significant correlations between possible harm and GFR categories were found: ChatGPT (0.396; <italic>P</italic>&#x003C;.001), Copilot (0.476; <italic>P</italic>&#x003C;.001), and Scite (0.443; <italic>P</italic>&#x003C;.001).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Mean overall scores (artificial intelligence quality output score [AQUOS]) of 4 AI chatbots based on glomerular filtration rate (GFR) categories (A, B) and the complexity of medication (C, D) in both German and English. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e87803_fig02.png"/></fig></sec><sec id="s3-4"><title>Medication Count&#x2013;Based Complexity</title><p>In the low-complexity category, 33 patients were included; in the medium-complexity category, 46 patients were included; and in the high-complexity category, 21 patients were included.</p><p>The trends of overall AQUOS in German and English are shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, with specific values provided in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. The overall AQUOS declines as the complexity increases (Scite: &#x2212;0.239, <italic>P</italic>=.02; <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). Regarding the English outputs, the only exception is Copilot, where AQUOS increases from medium to high complexity but does not exceed the score in low complexity. The maximum overall mean AQUOS was achieved by ChatGPT (79.1%, SD 9.0%). The lowest overall mean AQUOS was observed for Gemini (43.3%, SD 25.5%). In low complexity, ChatGPT reached the highest single overall AQUOS (95.2%), while Gemini reached the lowest single overall score (&#x2212;19.0%).</p><p>Gemini caused possible harm in medium complexity, with a mean of 0.9 (SD 1.0), corresponding to mild harm. 
In high complexity, possible harm was mild and was caused by Gemini (mean 0.6, SD 1.0). There was a significant correlation between the possible harm and the complexity only in German: ChatGPT (0.219; <italic>P</italic>=.03), Copilot (0.226; <italic>P</italic>=.02), and Scite (0.237; <italic>P</italic>=.02).</p></sec><sec id="s3-5"><title>Prompting Language</title><p>For all chatbots except Copilot, the overall AQUOS was higher when the prompt was in English than in German (<xref ref-type="fig" rid="figure3">Figure 3</xref>). For Copilot, the overall AQUOS was the same (median 71.4% [IQR 13.3%] in German and 71.4% [IQR 6.7%] in English) in both languages. ChatGPT achieved the highest scores in English (median 76.2% [IQR 12.5%]), while Gemini reached the lowest scores in German (median 42.9% [IQR 33.3%]). There was no possible harm in the median; however, regarding the IQR of Gemini in both languages, there was mild harm.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Comparison of the prompting languages, German and English, and 4 different AI chatbots (ChatGPT, Copilot, Gemini, and Scite), showing median overall scores (IQR) [%].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e87803_fig03.png"/></fig></sec><sec id="s3-6"><title>Reproducibility</title><p>There was no significant difference between t0 and t1 regarding the overall AQUOS (<xref ref-type="table" rid="table2">Table 2</xref>). However, some single-score criteria showed significant differences.</p><p>In low complexity, there was a significant difference at Gemini in English regarding AQUOS criterion 6 (referral to medical or pharmaceutical professionals or follow-up checks), with a relative difference of 15.2% (<italic>P</italic>=.02). 
In medium complexity, there was a significant difference at ChatGPT in German for criterion 3 (suitability of the references given), with a relative difference of 5.4% (<italic>P</italic>=.01). Other significant differences were observed at Gemini, all in English, for criterion 4 (correct dose recommendation) and possible harm, with corresponding relative differences of 16.9% (<italic>P</italic>=.004) and 12.5% (<italic>P</italic>=.001).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Reproducibility for all overall artificial intelligence quality output scores (AQUOS; t0 vs t1) and relative differences (<italic>P</italic> value)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Complexity</td><td align="left" valign="bottom" colspan="2">ChatGPT</td><td align="left" valign="bottom" colspan="2">Copilot</td><td align="left" valign="bottom" colspan="2">Gemini</td><td align="left" valign="bottom" colspan="2">Scite</td></tr><tr><td align="left" valign="top">German</td><td align="left" valign="top">English</td><td align="left" valign="top">German</td><td align="left" valign="top">English</td><td align="left" valign="top">German</td><td align="left" valign="top">English</td><td align="left" valign="top">German</td><td align="left" valign="top">English</td></tr></thead><tbody><tr><td align="left" valign="top">Low complexity (%)</td><td align="left" valign="top">2.50 (<italic>P</italic>=.94)</td><td align="left" valign="top">1.89 (<italic>P</italic>=.17)</td><td align="left" valign="top">2.95 (<italic>P</italic>=.08)</td><td align="left" valign="top">2.42 (<italic>P</italic>=.51)</td><td align="left" valign="top">2.58 (<italic>P</italic>=.12)</td><td align="left" valign="top">5.33 (<italic>P</italic>=.72)</td><td align="left" valign="top">2.42 (<italic>P</italic>=.16)</td><td align="left" valign="top">1.52 
(<italic>P</italic>=.34)</td></tr><tr><td align="left" valign="top">Medium complexity (%)</td><td align="left" valign="top">2.66 (<italic>P</italic>=.08)</td><td align="left" valign="top">2.23 (<italic>P</italic>=.66)</td><td align="left" valign="top">1.47 (<italic>P</italic>=.41)</td><td align="left" valign="top">2.07 (<italic>P</italic>=.94)</td><td align="left" valign="top">1.20 (<italic>P</italic>=.52)</td><td align="left" valign="top">6.03 (<italic>P</italic>=.42)</td><td align="left" valign="top">1.74 (<italic>P</italic>=.25)</td><td align="left" valign="top">0.82 (<italic>P</italic>=.66)</td></tr><tr><td align="left" valign="top">High complexity (%)</td><td align="left" valign="top">4.40 (<italic>P</italic>=.41)</td><td align="left" valign="top">2.98 (<italic>P</italic>=.23)</td><td align="left" valign="top">4.29 (<italic>P</italic>=.86)</td><td align="left" valign="top">1.43 (<italic>P</italic>=.43)</td><td align="left" valign="top">2.74 (<italic>P</italic>=.69)</td><td align="left" valign="top">4.29 (<italic>P</italic>=.68)</td><td align="left" valign="top">2.62 (<italic>P</italic>=.75)</td><td align="left" valign="top">2.14 (<italic>P</italic>=.37)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Number of drugs taken in different complexity intervals: low complexity=5&#x2010;9 drugs; medium complexity=10&#x2010;14 drugs; high complexity=at least 15 drugs, and the 4 different chatbots (ChatGPT, Copilot, Gemini, and Scite).</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Key Findings</title><sec id="s4-1-1"><title>Overview</title><p>To evaluate AI chatbot outputs in drug-related queries, we developed the AQUOS (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
Using this score, we wanted to find out how it changes depending on (1) renal function, (2) the complexity of the medication taken by the patients, and (3) the language used for prompting, and (4) whether the outputs are reproducible over time.</p><p>AQUOS declined with decreasing renal function and increasing medication count&#x2013;based complexity, with possible harm correlating accordingly. Overall scores were higher in English than in German prompting, and AI chatbot answers were highly reproducible across both time points.</p></sec><sec id="s4-1-2"><title>Renal Function</title><p>The poorer overall AQUOS with worsening renal function, alongside the correlation between lower GFR and increased potential harm, is consistent with findings by van Nuland et al [<xref ref-type="bibr" rid="ref27">27</xref>], who similarly reported poor ChatGPT performance in patients with renal dysfunction. In contrast to that study, we found that the mean scores of GPT-4 and its single-score criteria for correctness and accuracy (criteria 4 and 5) were always higher than 70% in English. Thus, even though the comparison is difficult because of the different methods, the quality of ChatGPT was better in our study. However, we also used a newer AI chatbot that could be better and more precise regarding renal function and dosage adjustments. But the pattern of AI chatbots performing poorly in queries from patients with renal dysfunction reflects the greater clinical complexity associated with this condition, which current AI chatbots appear less trained to handle. This may be due to under-representation of such patient populations in training data.</p></sec><sec id="s4-1-3"><title>Medication Count&#x2013;Based Complexity</title><p>AQUOS generally decreases with increasing medication count&#x2013;based complexity, and patient safety was most favorable in the low-complexity group. 
Splitting complex medication lists into separate queries could be a practical interim approach, though this would preclude the assessment of drug-drug interactions and requires evaluation in future studies. This measure could solve this issue, together with more precise and advanced training of AI chatbots in drug-related queries, but it needs to be validated in further studies. However, in this design, the chatbot&#x2019;s answers are more clinically useful and safer for the patient if the complexity is lower.</p><p>Roosan et al [<xref ref-type="bibr" rid="ref6">6</xref>] investigated whether GPT-4 could solve patient cases of different complexity&#x2014;here defined as difficulty&#x2014;accurately in terms of drug interactions, the precision of recommendations and alternatives, and the adequacy of the created medication plans. They did not prompt in just 1 input but used a new prompt for each of the 3 key aspects. All 39 patient cases were solved correctly, with ChatGPT required to reach a threshold of 70% to be rated as correct.</p></sec><sec id="s4-1-4"><title>Prompting Language</title><p>Because overall scores were higher in English for almost all AI chatbots, likely due to the majority of training data being in English, any future development of, for example, a medicinal product based on an AI chatbot should take into account that queries should be posed in English or translated into English. However, at this point, further research is necessary to determine the quality of the AI chatbot outputs when we instruct the chatbot to translate the prompt into English and then answer the query.</p><p>Jin et al [<xref ref-type="bibr" rid="ref28">28</xref>] tested different languages (English, Hindi, Chinese, and Spanish) in LLMs and found that the best answers were given in English. However, they did not investigate German in their study. 
Schlicht et al [<xref ref-type="bibr" rid="ref29">29</xref>] also examined German (in addition to English, Turkish, and Chinese) and tested GPT-4, among others, as we did. They found that there is quite a bit of variability between the languages and the associated outputs&#x2014;for example, in output length or the consistency of information in the output of different languages.</p><p>Beyond quality differences, the observed language advantages in English raise important equity concerns. Many clinicians worldwide practice in non&#x2013;English-speaking environments and will naturally query AI chatbots in their native language. If responses in languages other than English are generally of lower quality, this may compromise safe and effective use in these settings. As an interim solution, non-English queries could be translated during the query process into English, or validated translation tools could be used before prompts are entered, though the quality implications of such translated inputs require further evaluation. More generally, the English-centric focus of current training data for LLMs carries the risk of further worsening global inequalities in health care; therefore, institutions or regions with limited English knowledge may benefit less from AI tools in clinical environments and may be exposed to a higher risk. In the future development of AI tools, like AI chatbots, especially for clinical use, multilingualism should be explicitly considered as a key quality and safety criterion.</p></sec><sec id="s4-1-5"><title>Reproducibility</title><p>Our good overall reproducibility across both time points differs from the results of Morath et al [<xref ref-type="bibr" rid="ref5">5</xref>], who took 3 inputs at 4 different times. In their study, 3 out of 12 were the same over time, and no objective statistical analysis was performed. 
Furthermore, Al-Dujaili et al [<xref ref-type="bibr" rid="ref18">18</xref>] tested 20 fictive patient cases using GPT-3.5, over 3 time points with a Cohen &#x03BA;, finding a moderate positive agreement overall, which is in line with our findings. However, a comparison of their data to our findings is not valid, because we had 100 double-inserted inputs, used 4 different chatbots (including a different and more recent ChatGPT version), and analyzed the outputs using a scoring system, comparing the scores statistically with relative differences.</p></sec></sec><sec id="s4-2"><title>AI Quality Output Score</title><p>In this study, we tested our AQUOS for the first time, and to our knowledge, this is the first score that rates AI outputs in drug-related queries with different aspects summed up. The following aspects have been investigated individually: completeness [<xref ref-type="bibr" rid="ref4">4</xref>], references [<xref ref-type="bibr" rid="ref4">4</xref>], correctness [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], accuracy [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], and possible harm to patients [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. We designed AQUOS to be applied to other drug-related queries, not just renal dosage adjustment. As a few criteria have been similarly used in the studies mentioned above, there is a basis for external validity. Nevertheless, it should be tested in other areas, studies, and languages.</p><p>Although AQUOS has some weaknesses, such as the AI chatbots regularly losing points for not giving references at all&#x2014;despite the prompt instructing them to do so&#x2014;and for not providing good or accessible references, as long as the content and the recommendations are correct and accurate, there would typically be no harm to the patient. 
Therefore, it may be necessary to adjust AQUOS depending on the research question.</p></sec><sec id="s4-3"><title>Risks of AI-Generated Clinical Information</title><p>AI chatbots generate fluent and professionally worded responses that may appear authoritative regardless of their actual accuracy or completeness. Users may, therefore, treat chatbot outputs as expert recommendations rather than as an informational starting point that requires critical evaluation. A related issue is automation bias, in which health care professionals may have overconfidence in the AI-generated outputs due to their confident tone, which can lead to dosage decisions being made without adequate verification.</p><p>The recurring question of accountability for such clinical decisions remains largely unresolved. Regulatory frameworks addressing AI in high-risk settings are increasingly emerging internationally, with the European Artificial Intelligence Act [<xref ref-type="bibr" rid="ref30">30</xref>] being a well-known example that explicitly classifies certain AI applications in health care as high risk and imposes requirements for transparency and human oversight. However, implementation remains variable across countries and health care systems, and the specific allocation of accountability in cases where AI-generated recommendations result in patient harm has not yet been adequately addressed in practice.</p><p>These considerations support the conclusion that AI chatbots should be positioned as informational assistance, with the understanding that final clinical responsibility remains with the responsible health care professional, particularly in complex scenarios such as renal dosage adjustment in geriatric patients.</p></sec><sec id="s4-4"><title>Clinical Role of AI Chatbots</title><p>The findings of this study should be interpreted within the context of a clearly defined scope of application. 
The AI chatbots evaluated here serve as informational aids and should not be mistaken for clinical decision support systems in the regulatory sense or as defined by medical device regulations. Appropriate use scenarios include support for educational purposes, preliminary orientation for clinicians, or self-checking in low-risk, nonurgent cases. AI chatbots are not suitable for directly guiding prescribing decisions, replacing pharmacist consultations, or operating without subsequent verification by a qualified health care professional. This distinction is particularly critical in high-risk settings, such as renal dose adjustment in geriatric patients, where errors can directly harm patients. Any integration of AI chatbot outputs into clinical workflows must, therefore, be accompanied by explicit precautions that ensure the final clinical decision remains with the responsible health care professional.</p></sec><sec id="s4-5"><title>Limitations</title><p>This study has some limitations. First, we used a uniform input structure, varying the GFR values and drug dosages. This approach ensured consistency across the cases, though it does not capture the potential variability that may arise from alternative prompt formulations or interaction types. While we did not incorporate few-shot examples, follow-up dialogs, adjustment settings such as temperature or system instructions, or the usage of the web interface instead of an application programming interface&#x2014;which reduces reproducibility&#x2014;our method was intended to reflect a common practical usage scenario. Since the rater was not blinded during scoring, the possibility of bias cannot be ruled out.</p><p>The patient data were from a single hospital, and we focused on the discharge medication. 
Our analysis concentrated specifically on the GFR and related dosage adjustments, which are clinically relevant, though other aspects of the medication could also be considered in future research.</p><p>These methodological choices, while deliberate, mean that the findings should be interpreted within the context of the study design. Future studies could expand the range of clinical variables, use diverse data sources, and explore different interaction modes and types to further evaluate chatbot performance.</p></sec><sec id="s4-6"><title>Comparison With Prior Work</title><p>Besides pharmaceutical or drug-related queries, there is an increasing number of scientific studies evaluating medical chatbots in health care.</p><p>Huo et al [<xref ref-type="bibr" rid="ref31">31</xref>] conducted a systematic review to examine the aspects investigated in studies evaluating AI chatbots in health care contexts. They found that almost two thirds of the 137 included studies evaluated AI chatbots using subjective parameters only. They also noted a lack of detailed descriptions of prompt engineering, as well as insufficient consideration of patient safety, regulatory issues, and ethical considerations. Our study directly addresses several of the gaps identified by Huo et al [<xref ref-type="bibr" rid="ref31">31</xref>], including the need for objective assessment and methodological transparency, such as in prompt design and safety or harm assessment. Complementary to this, other research has focused on evaluating conversational AI chatbots with particular attention to their limitations and associated concerns [<xref ref-type="bibr" rid="ref32">32</xref>]. Wang et al [<xref ref-type="bibr" rid="ref32">32</xref>] included 65 studies in their systematic review in which AI chatbots like ChatGPT were applied in health care contexts. Nearly half of the reviewed studies examined medical knowledge inquiries and reported a rather high precision in the AI chatbots&#x2019; responses. 
In 85% of all papers analyzed, concerns such as the reliability and bias of the AI chatbots were raised [<xref ref-type="bibr" rid="ref32">32</xref>]. Our findings align with these results, as our evaluation using AQUOS and, for example, the study design with 2 different prompting times also places a strong emphasis on reliability.</p><p>Li et al [<xref ref-type="bibr" rid="ref33">33</xref>] compared 8 different AI chatbots on 48 clinical questions using a combination of an expert panel and a 6D evaluation framework. They found that all the chatbots showed limitations and weaknesses in complex cases but also highlighted the potential of AI chatbots, always considering the associated risks [<xref ref-type="bibr" rid="ref33">33</xref>]. Similarly, our study revealed such limitations, especially in more complex cases, such as those involving reduced renal function or multiple concomitant medications. Additionally, another review found that LLMs can support clinicians in various tasks but are not yet reliably applicable across all clinical domains, making deployment challenging [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec><sec id="s4-7"><title>Conclusions</title><p>We developed and validated a new score (AQUOS) to examine the quality of AI chatbot answers for dose adjustment in patients according to their renal function. Using real patient data from a geriatric setting, we found that the quality of the AI chatbots&#x2019; responses varied depending on the chatbot, with values ranging from &#x2212;19.0% to 95.2%. First, these responses were language-dependent. This is relevant because, in inpatient and outpatient settings, information is often sought in the patient&#x2019;s native language. Furthermore, the results were highly reproducible and did not differ significantly at 2 points in time with independent search queries. 
However, it was found that the quality of the queries and the potential harm were adversely affected when kidney function decreased and the number of medications increased. In general, however, the quality of the responses is not yet convincing, especially in complex situations where advice from a chatbot is particularly sought after. ChatGPT proved to be the best chatbot in terms of quality. Future studies should evaluate AI chatbot performance using broader, more individualized clinical contexts to assess and compare directly whether additional information about the patients improves the accuracy and safety of AI-generated dosing recommendations. Further research should also be encouraged to find ways to influence the results obtained to make them suitable for practical use. In addition, ethical and legal issues surrounding the use of chatbots with real patients in treatment routines still need to be clarified in the future.</p></sec></sec></body><back><ack><p>The authors thank the staff of the Sana Geriatric Centre Zwenkau, especially the physicians working in the rehabilitation ward, for their support. The authors also thank the pharmacists who evaluated the chatbot's answers for validation purposes. 
The authors used DeepL (DeepL SE, Cologne) to translate single parts of this manuscript from German into English, and the translations were subsequently reviewed and revised by the authors.</p></ack><notes><sec><title>Funding</title><p>This study was supported by the Open Access Publishing Fund of Leipzig University.</p></sec><sec><title>Data Availability</title><p>The datasets generated and analyzed during this study are available from the corresponding author upon reasonable request, in accordance with applicable German data protection legislation and the associated ethical approval.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: CB, TB</p><p>Data curation: CB</p><p>Formal analysis: CB</p><p>Funding acquisition: RS, TB</p><p>Investigation: CB</p><p>Methodology: CB, TB</p><p>Project administration: CB, RS, TB</p><p>Resources: RS, TB</p><p>Supervision: TB</p><p>Validation: CB, TB</p><p>Visualization: CB, TB</p><p>Writing&#x2014;original draft: CB, TB</p><p>Writing&#x2014;review and editing: CB, RS, TB</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AQUOS</term><def><p>artificial intelligence quality output score</p></def></def-item><def-item><term id="abb3">CHART</term><def><p>Chatbot Assessment Reporting Tool</p></def></def-item><def-item><term id="abb4">GFR</term><def><p>glomerular filtration rate</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">WHO</term><def><p>World Health Organization</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>22</day><volume>25</volume><fpage>e48659</fpage><pub-id pub-id-type="doi">10.2196/48659</pub-id><pub-id pub-id-type="medline">37606976</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lie</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Proactive polypharmacy management using large language models: opportunities to enhance geriatric care</article-title><source>J Med Syst</source><year>2024</year><month>04</month><day>18</day><volume>48</volume><issue>1</issue><fpage>41</fpage><pub-id pub-id-type="doi">10.1007/s10916-024-02058-y</pub-id><pub-id pub-id-type="medline">38632172</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><pub-id 
pub-id-type="doi">10.1001/jama.2024.21700</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Andrikyan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Sametinger</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Kosfeld</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Artificial intelligence-powered chatbots in search engines: a cross-sectional study on the quality and risks of drug information for patients</article-title><source>BMJ Qual Saf</source><year>2025</year><month>02</month><volume>34</volume><issue>2</issue><fpage>100</fpage><lpage>109</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2024-017476</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morath</surname><given-names>B</given-names> </name><name name-style="western"><surname>Chiriac</surname><given-names>U</given-names> </name><name name-style="western"><surname>Jaszkowski</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Performance and risks of ChatGPT used in drug information: an exploratory real-world analysis</article-title><source>Eur J Hosp Pharm</source><year>2024</year><month>10</month><day>25</day><volume>31</volume><issue>6</issue><fpage>491</fpage><lpage>497</lpage><pub-id pub-id-type="doi">10.1136/ejhpharm-2023-003750</pub-id><pub-id pub-id-type="medline">37263772</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roosan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Padua</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Khan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Verzosa</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name></person-group><article-title>Effectiveness of ChatGPT in clinical pharmacy and the role of artificial intelligence in medication therapy management</article-title><source>J Am Pharm Assoc (2003)</source><year>2024</year><month>03</month><volume>64</volume><issue>2</issue><fpage>422</fpage><lpage>428</lpage><pub-id pub-id-type="doi">10.1016/j.japh.2023.11.023</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ponticelli</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sala</surname><given-names>G</given-names> </name><name name-style="western"><surname>Glassock</surname><given-names>RJ</given-names> </name></person-group><article-title>Drug management in the elderly adult with chronic kidney disease: a review for the primary care physician</article-title><source>Mayo Clin Proc</source><year>2015</year><month>05</month><volume>90</volume><issue>5</issue><fpage>633</fpage><lpage>645</lpage><pub-id pub-id-type="doi">10.1016/j.mayocp.2015.01.016</pub-id><pub-id pub-id-type="medline">25771152</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mangoni</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Jackson</surname><given-names>SHD</given-names> </name></person-group><article-title>Age-related changes in pharmacokinetics and pharmacodynamics: basic principles and practical applications</article-title><source>Br J Clin 
Pharmacol</source><year>2004</year><month>01</month><volume>57</volume><issue>1</issue><fpage>6</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1046/j.1365-2125.2003.02007.x</pub-id><pub-id pub-id-type="medline">14678335</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Halli-Tierney</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Scarbrough</surname><given-names>C</given-names> </name><name name-style="western"><surname>Carroll</surname><given-names>D</given-names> </name></person-group><article-title>Polypharmacy: evaluating risks and deprescribing</article-title><source>Am Fam Physician</source><year>2019</year><month>07</month><day>1</day><volume>100</volume><issue>1</issue><fpage>32</fpage><lpage>38</lpage><pub-id pub-id-type="medline">31259501</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Albogami</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Alfakhri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alaqil</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Safety and quality of AI chatbots for drug-related inquiries: a real-world comparison with licensed pharmacists</article-title><source>Digit Health</source><year>2024</year><volume>10</volume><fpage>20552076241253523</fpage><pub-id pub-id-type="doi">10.1177/20552076241253523</pub-id><pub-id pub-id-type="medline">38757086</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Goodman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Patrinely</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the accuracy and reliability of AI-generated medical responses: an evaluation of the ChatGPT model</article-title><source>Res Sq</source><year>2023</year><month>02</month><day>28</day><fpage>rs.3.rs-2566942</fpage><pub-id pub-id-type="doi">10.21203/rs.3.rs-2566942/v1</pub-id><pub-id pub-id-type="medline">36909565</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heston</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Khun</surname><given-names>C</given-names> </name></person-group><article-title>Prompt engineering in medical education</article-title><source>Int Med Educ</source><year>2023</year><volume>2</volume><issue>3</issue><fpage>198</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.3390/ime2030019</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>White</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Hays</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A prompt pattern catalog to enhance prompt engineering with ChatGPT</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 21, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.11382</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>The CHART Collaborative</collab></person-group><article-title>Reporting guidelines for chatbot health 
advice studies: explanation and elaboration for the Chatbot Assessment Reporting Tool (CHART)</article-title><source>BMJ</source><year>2025</year><volume>390</volume><fpage>e083305</fpage><pub-id pub-id-type="doi">10.1136/bmj-2024-083305</pub-id><pub-id pub-id-type="medline">40750271</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>GPT-4o</article-title><source>OpenAI Developers</source><access-date>2026-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://developers.openai.com/api/docs/models/gpt-4o">https://developers.openai.com/api/docs/models/gpt-4o</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Model versions and lifecycle [Web page in German]</article-title><source>Google Cloud</source><access-date>2026-05-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.cloud.google.com/vertex-ai/generative-ai/docs/learn/model-versions?hl=de#retired-models">https://docs.cloud.google.com/vertex-ai/generative-ai/docs/learn/model-versions?hl=de#retired-models</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alexa</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Richter</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bertsche</surname><given-names>T</given-names> </name></person-group><article-title>Enhancing evidence-based pharmacy by comparing the quality of web-based information sources to the EVInews database: randomized controlled trial with German community pharmacists</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>21</day><volume>25</volume><fpage>e45582</fpage><pub-id pub-id-type="doi">10.2196/45582</pub-id><pub-id 
pub-id-type="medline">37342085</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al-Dujaili</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Omari</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pillai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Al Faraj</surname><given-names>A</given-names> </name></person-group><article-title>Assessing the accuracy and consistency of ChatGPT in clinical pharmacy management: a preliminary analysis with clinical pharmacy experts worldwide</article-title><source>Res Social Adm Pharm</source><year>2023</year><month>12</month><volume>19</volume><issue>12</issue><fpage>1590</fpage><lpage>1594</lpage><pub-id pub-id-type="doi">10.1016/j.sapharm.2023.08.012</pub-id><pub-id pub-id-type="medline">37696742</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Informationen zur korrekten und sicheren Arzneimittel-Anwendung [Web page in German]</article-title><source>DOSING</source><access-date>2026-05-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.dosing.de/">https://www.dosing.de/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="report"><person-group person-group-type="author"><collab>WHO Patient Safety</collab></person-group><article-title>Conceptual framework for the international classification for patient safety</article-title><year>2010</year><access-date>2025-07-28</access-date><publisher-name>World Health Organization</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/publications/i/item/WHO-IER-PSP-2010.2">https://www.who.int/publications/i/item/WHO-IER-PSP-2010.2</ext-link></comment></nlm-citation></ref><ref
id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the kappa statistic</article-title><source>Biochem Med (Zagreb)</source><year>2012</year><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> </name></person-group><article-title>A guideline of selecting and reporting intraclass correlation coefficients for reliability research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maes</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Tremp</surname><given-names>RM</given-names> </name><collab>GSASA Working group on clinical pharmacy</collab><name name-style="western"><surname>Hersberger</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Lampert</surname><given-names>ML</given-names> </name></person-group><article-title>Demonstrating the clinical pharmacist&#x2019;s activity: validation of an intervention oriented classification system</article-title><source>Int J Clin 
Pharm</source><year>2015</year><month>12</month><volume>37</volume><issue>6</issue><fpage>1162</fpage><lpage>1171</lpage><pub-id pub-id-type="doi">10.1007/s11096-015-0179-z</pub-id><pub-id pub-id-type="medline">26290379</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alsayed</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Al-Dulaimi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alnatour</surname><given-names>D</given-names> </name><name name-style="western"><surname>Awajan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Alshammari</surname><given-names>B</given-names> </name></person-group><article-title>Validation of an assessment, medical problem-oriented plan, and care plan tools for demonstrating the clinical pharmacist&#x2019;s activities</article-title><source>Saudi Pharm J</source><year>2022</year><month>10</month><volume>30</volume><issue>10</issue><fpage>1464</fpage><lpage>1472</lpage><pub-id pub-id-type="doi">10.1016/j.jsps.2022.07.007</pub-id><pub-id pub-id-type="medline">36387343</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hyrk&#x00E4;s</surname><given-names>K</given-names> </name><name name-style="western"><surname>Appelqvist-Schmidlechner</surname><given-names>K</given-names> </name><name name-style="western"><surname>Oksa</surname><given-names>L</given-names> </name></person-group><article-title>Validating an instrument for clinical supervision using an expert panel</article-title><source>Int J Nurs Stud</source><year>2003</year><month>08</month><volume>40</volume><issue>6</issue><fpage>619</fpage><lpage>625</lpage><pub-id pub-id-type="doi">10.1016/s0020-7489(03)00036-1</pub-id><pub-id 
pub-id-type="medline">12834927</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eckardt</surname><given-names>KU</given-names> </name><name name-style="western"><surname>Binet</surname><given-names>I</given-names> </name><name name-style="western"><surname>de Groot</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Nomenklatur f&#x00FC;r Nierenfunktion und Nierenkrankheiten &#x2013; Durch Pr&#x00E4;zision und Verst&#x00E4;ndlichkeit zu besserer Erfassung und Prognose [Article in German]</article-title><source>Dtsch Med Wochenschr</source><year>2022</year><month>10</month><volume>147</volume><issue>21</issue><fpage>1398</fpage><lpage>1406</lpage><pub-id pub-id-type="doi">10.1055/a-1908-5163</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Nuland</surname><given-names>M</given-names> </name><name name-style="western"><surname>Snoep</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Egberts</surname><given-names>T</given-names> </name><name name-style="western"><surname>Erdogan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wassink</surname><given-names>R</given-names> </name><name name-style="western"><surname>van der Linden</surname><given-names>PD</given-names> </name></person-group><article-title>Poor performance of ChatGPT in clinical rule-guided dose interventions in hospitalized patients with renal dysfunction</article-title><source>Eur J Clin Pharmacol</source><year>2024</year><month>08</month><volume>80</volume><issue>8</issue><fpage>1133</fpage><lpage>1140</lpage><pub-id pub-id-type="doi">10.1007/s00228-024-03687-5</pub-id><pub-id pub-id-type="medline">38592470</pub-id></nlm-citation></ref><ref 
id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chandra</surname><given-names>M</given-names> </name><name name-style="western"><surname>Verma</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>De Choudhury</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name></person-group><article-title>Better to ask in English: cross-lingual evaluation of large language models for healthcare queries</article-title><conf-name>ACM Web Conference 2024 (WWW &#x2019;24)</conf-name><conf-date>May 13-17, 2024</conf-date><pub-id pub-id-type="doi">10.1145/3589334.3645643</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Schlicht</surname><given-names>IB</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Sayin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Flek</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rosso</surname><given-names>P</given-names> </name></person-group><article-title>Do LLMs provide consistent answers to health-related questions across languages?</article-title><conf-name>47th European Conference on Information Retrieval, ECIR 2025</conf-name><conf-date>Apr 6-10, 2025</conf-date><pub-id pub-id-type="doi">10.1007/978-3-031-88714-7_30</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><source>EU Artificial Intelligence 
Act</source><access-date>2026-05-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://artificialintelligenceact.eu/">https://artificialintelligenceact.eu/</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Boyle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Marfo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Large language models for chatbot health advice studies: a systematic review</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2457879</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.57879</pub-id><pub-id pub-id-type="medline">39903463</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ni</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Applications and concerns of ChatGPT and other conversational large language models in health care: systematic review</article-title><source>J Med Internet Res</source><year>2024</year><month>11</month><day>7</day><volume>26</volume><fpage>e22769</fpage><pub-id pub-id-type="doi">10.2196/22769</pub-id><pub-id pub-id-type="medline">39509695</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Du</surname><given-names>P</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Comparative analysis of generative artificial intelligence systems in solving clinical pharmacy problems: mixed methods study</article-title><source>JMIR Med Inform</source><year>2025</year><month>07</month><day>24</day><volume>13</volume><fpage>e76128</fpage><pub-id pub-id-type="doi">10.2196/76128</pub-id><pub-id pub-id-type="medline">40705654</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Python</surname><given-names>A</given-names> </name></person-group><article-title>Implementing large language models in health care: clinician-focused review with interactive guideline</article-title><source>J Med Internet Res</source><year>2025</year><volume>27</volume><fpage>e71916</fpage><pub-id pub-id-type="doi">10.2196/71916</pub-id><pub-id pub-id-type="medline">40644686</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The artificial intelligence quality output score (AQUOS).</p><media xlink:href="ai_v5i1e87803_app1.doc" xlink:title="DOC File, 42 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Artificial intelligence (AI) chatbot outputs and scoring examples.</p><media xlink:href="ai_v5i1e87803_app2.doc" xlink:title="DOC File, 33 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Validation procedure of artificial intelligence quality output score (AQUOS).</p><media xlink:href="ai_v5i1e87803_app3.docx" 
xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Overall output scores (artificial intelligence quality output score [AQUOS]) of each artificial intelligence (AI) chatbot in German and English.</p><media xlink:href="ai_v5i1e87803_app4.doc" xlink:title="DOC File, 33 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Correlation of the overall output scores (artificial intelligence quality output score [AQUOS]) of each artificial intelligence (AI) chatbot with glomerular filtration rate (GFR) categories and complexity categories in German and English, correlation coefficient (<italic>P</italic> value, 2-sided).</p><media xlink:href="ai_v5i1e87803_app5.doc" xlink:title="DOC File, 31 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Overall output scores (artificial intelligence quality output score [AQUOS]) of each artificial intelligence (AI) chatbot in German and English.</p><media xlink:href="ai_v5i1e87803_app6.doc" xlink:title="DOC File, 32 KB"/></supplementary-material><supplementary-material id="app7"><label>Checklist 1</label><p>CHART checklist.</p><media xlink:href="ai_v5i1e87803_app7.pdf" xlink:title="PDF File, 382 KB"/></supplementary-material></app-group></back></article>