<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e72101</article-id><article-id pub-id-type="doi">10.2196/72101</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of a Small Language Model Versus a Large Language Model in Answering Glaucoma Frequently Asked Patient Questions: Development and Usability Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Faneli</surname><given-names>Adriano Cypriano</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Scherer</surname><given-names>Rafael</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Muralidhar</surname><given-names>Rohit</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Guerreiro-Filho</surname><given-names>Marcus</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Beniz</surname><given-names>Luiz</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Vilasboas-Campos</surname><given-names>Ver&#x00F4;nica</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Costa</surname><given-names>Douglas</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jammal</surname><given-names>Alessandro A</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Medeiros</surname><given-names>Felipe A</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Bascom Palmer Eye Institute, University of Miami</institution><addr-line>900 NW 17th St</addr-line><addr-line>Miami</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Ophthalmology, Federal University of S&#x00E3;o Paulo</institution><addr-line>S&#x00E3;o Paulo</addr-line><country>Brazil</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Emam</surname><given-names>Khaled El</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Jafarizadeh</surname><given-names>Ali</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Souza</surname><given-names>Joshua De</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yoo</surname><given-names>Tae Keun</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Felipe A Medeiros, MD, PhD, Bascom Palmer Eye Institute, University of Miami, 900 NW 17th St, Miami, FL, 33136, United States, 1 305-326-6000; <email>fmedeiros@med.miami.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>6</day><month>1</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e72101</elocation-id><history><date date-type="received"><day>03</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>11</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>08</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Adriano Cypriano Faneli, Rafael Scherer, Rohit Muralidhar, Marcus Guerreiro-Filho, Luiz Beniz, Ver&#x00F4;nica Vilasboas-Campos, Douglas Costa, Alessandro A Jammal, Felipe A Medeiros. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 6.1.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e72101"/><abstract><sec><title>Background</title><p>Large language models (LLMs) have been shown to answer patient questions in ophthalmology similarly to human experts. However, concerns remain regarding their use, particularly related to patient privacy and potential inaccuracies that could compromise patient safety.</p></sec><sec><title>Objective</title><p>This study aimed to compare the performance of an LLM in answering frequently asked patient questions about glaucoma with that of a small language model (SLM) trained locally on ophthalmology-specific literature.</p></sec><sec sec-type="methods"><title>Methods</title><p>We compiled 35 frequently asked questions on glaucoma, categorized into 6 domains, including pathogenesis, risk factors, clinical manifestations, diagnosis, treatment and prevention, and prognosis. Each question was posed to both an SLM using a retrieval-augmented generation framework, trained on ophthalmology-specific literature, and to an LLM (ChatGPT 4.0, OpenAI). Three glaucoma specialists from a single institution independently assessed the answers using a 3-tier accuracy rating scale: poor (score=1), borderline (score=2), and good (score=3). Each answer received a quality score ranging from 3 to 9 points based on the sum of ratings from the 3 graders. 
Readability grade level was assessed using 4 formulas: the Flesch-Kincaid Grade Level, the Gunning Fog Index, the Coleman-Liau Index, and the Simple Measure of Gobbledygook Index.</p></sec><sec sec-type="results"><title>Results</title><p>The answers from the SLM demonstrated quality comparable to that of ChatGPT 4.0, scoring mean 7.4 (SD 1.5) and mean 7.9 (SD 1.2), respectively, out of a total of 9 points (<italic>P</italic>=.13). The accuracy rating was consistent overall and across all 6 glaucoma care domains. Both models provided answers considered unsuitable for health care&#x2013;related information, as they were difficult for the average layperson to read.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Both models generated accurate content, but the answers were considered challenging for the average layperson to understand, making them unsuitable for health care&#x2013;related information. Given the specialized SLM&#x2019;s comparable performance to the LLM, its high customization potential, lower cost, and ability to operate locally, it presents a viable option for deploying natural language processing in real-world ophthalmology clinical settings.</p></sec></abstract><kwd-group><kwd>online health information</kwd><kwd>ChatGPT 4.0</kwd><kwd>glaucoma</kwd><kwd>large language model</kwd><kwd>small language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Recent progress in natural language processing (NLP) has been observed in health care, showcasing innovative approaches to preventive measures, diagnostics, and patient assistance. Specifically, large language models (LLMs) such as ChatGPT (OpenAI) have emerged as prominent tools in the field of ophthalmology and other medical specialties since their introduction in November 2022 [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. 
The conversational interface of ChatGPT and its unsupervised learning approach, particularly notable in its fourth generation, ChatGPT 4.0, has offered a novel and appealing way for patients to access medical information [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. This trend is underscored by the growing reliance on the internet for health-related information, a phenomenon that has become increasingly common among patients. A survey in the United States revealed that two-thirds of adults turn to the internet for health information, with one-third using it for self-diagnosis [<xref ref-type="bibr" rid="ref6">6</xref>]. However, despite these advancements and the increasing usage of digital resources for health information, the inability of ChatGPT to provide source citations remains a significant drawback, compromising its reliability and limiting its utility in clinical settings [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Recent literature has explored the role of LLMs in different ophthalmological scenarios. For example, Cai et al [<xref ref-type="bibr" rid="ref8">8</xref>] demonstrated strong performance of ChatGPT models in ophthalmology board-style certification questions, underscoring their educational potential in training ophthalmologists. Huang et al [<xref ref-type="bibr" rid="ref9">9</xref>] showed that ChatGPT&#x2019;s diagnostic capabilities in glaucoma could sometimes surpass those of ophthalmology residents, emphasizing their clinical utility in differential diagnosis and management. 
Additionally, Raghu et al [<xref ref-type="bibr" rid="ref10">10</xref>] identified the potential use of LLMs for diabetic retinopathy risk assessment, although they noted several limitations that restrict clinical deployment.</p><p>The substantial number of tasks that LLMs can perform highlights their potential for innovative research; however, the substantial computational demands for customizing these models, which may include over 100 billion parameters, present a significant challenge, making the technology largely unattainable due to computational resource limitations [<xref ref-type="bibr" rid="ref11">11</xref>]. In this context, small language models (SLMs) have emerged as a practical alternative [<xref ref-type="bibr" rid="ref12">12</xref>]. These scaled-down models offer advantages in terms of computational efficiency, ease of access, and customizability because they require fewer resources and facilitate deployment in more specific contexts [<xref ref-type="bibr" rid="ref12">12</xref>]. Their adaptability to specific needs and functions allows for the development of precise and accessible NLP tools by leveraging targeted, high-quality references, demonstrating a promising path for specialized applications [<xref ref-type="bibr" rid="ref12">12</xref>]. SLM can also be used in a closed local network without an internet connection, which diminishes the concerns about patient privacy and leakage of personal health information.</p><p>More recently, the use of retrieval-augmented generation (RAG) frameworks in natural language models has enabled precise query processing and the generation of highly accurate and relevant responses. By encoding and vectorizing documents, RAG allows language models to access external information, extending their knowledge beyond what was available in the training data. 
Furthermore, by integrating external data, RAG enables natural language models to effectively provide source citations, thereby bolstering the credibility of the generated content [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>Despite the growing body of literature evaluating the use of LLMs in ophthalmology, the performance of a locally deployed domain-specific SLM remains unexplored. Therefore, this study assessed the efficacy of SLM enhanced with RAG technology compared to ChatGPT 4.0 for answering common patient inquiries regarding glaucoma. Glaucoma specialists evaluated the quality of the answers, and the level of readability was assessed using standardized methods.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This study was conducted at the Ophthalmology Department of the Bascom Palmer Eye Institute (BPEI) in Miami. Patient information was not included in this study. Between January and February 2024, commonly asked questions related to glaucoma care were queried from reputable online health information outlets, such as the American Glaucoma Society (AGS) and Eye Care Forum, which enables patients to ask questions and receive answers from the American Academy of Ophthalmology (AAO)&#x2013;affiliated ophthalmologists.</p><p>Three fellowship-trained glaucoma specialists refined the first pool of 60 questions extracted from online resources by independently selecting those they considered as frequently asked in a glaucoma outpatient clinic setting. 
The 35 questions that all specialists considered frequent and common questions from patients with glaucoma were separated for analysis and categorized into 6 domains, such as pathogenesis, risk factors, clinical presentation, diagnosis, treatment and prevention, and prognosis (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-2"><title>Development of the Ophthalmology-Specific SLM</title><p>Our ophthalmology-specific SLM was developed based on the Hugging Face and Haystack algorithms [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. These models serve as a platform for building and deploying NLP models by performing indexing, information retrieval, and question-answering tasks. Specifically, we adopted Mistral 7B, a 7-billion-parameter model, as the SLM [<xref ref-type="bibr" rid="ref17">17</xref>]. We trained the SLM model using 60 ophthalmology books and 7862 papers from 17 MEDLINE-indexed ophthalmology journals from 2017 to 2023. This process yielded 366,924 snippets, which are succinct excerpts of information extracted from the dataset. These snippets play a crucial role in the operation of RAG, enabling the model to discern the most pertinent information required to address a given question effectively. RAG uses snippets to understand which information is most relevant to answering the specific question asked. These were provided in PDF format to Haystack [<xref ref-type="bibr" rid="ref16">16</xref>], which processed and split the text into 500-word chunks with 100 words of overlap. These word chunks were converted into model embeddings using the WhereIsAI/UAE-Large-V1 model for training [<xref ref-type="bibr" rid="ref18">18</xref>] and stored in the Haystack Facebook Artificial Intelligence Similarity Search database. 
This database is an open-source vector store and search engine that allows for the storage and retrieval of parts of a document relevant to the question being asked. For each question, the 3 most relevant 100-word chunks of text from the reference material were provided alongside the ophthalmology question when prompting the language models. We set the temperature to 0.5, the token limit to 500, and top-p to 1.0. We systematically searched publicly available literature databases, including PubMed and Google Scholar, using the keyword &#x201C;ophthalmology&#x201D; to construct the ophthalmology-specific dataset integrated with the RAG system. We prioritized open access documents published in peer-reviewed journals and directly relevant to clinical ophthalmic knowledge.</p></sec><sec id="s2-3"><title>Large Language Model</title><p>For comparison with LLMs, we used ChatGPT 4.0, developed by OpenAI, a 1.8 trillion-parameter LLM [<xref ref-type="bibr" rid="ref19">19</xref>]. ChatGPT is a generative artificial intelligence LLM chatbot that interacts with text and engages in human-like interactions [<xref ref-type="bibr" rid="ref19">19</xref>]. It is built on the GPT architecture and was initially trained on extensive amounts of text from books, papers, and online sources. The model&#x2019;s training process involves minimizing the difference between the expected and actual words in the dataset, enabling it to produce coherent text based on presented prompts [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Later versions, such as ChatGPT 4.0, have enhanced their functionalities, with over 1 billion users globally [<xref ref-type="bibr" rid="ref22">22</xref>]. The performance of the LLM model was assessed using the currently available online version at the time of the study, and only the first response for each question was documented. 
We used the same inference hyperparameters to ensure comparability with the SLM, with a temperature of 0.5, a token limit of 500, and top-p set to 1.0.</p></sec><sec id="s2-4"><title>Prompt Design</title><p>Each question was presented to the language models as a standardized prompt, following recent recommendations to maximize the performance of language models [<xref ref-type="bibr" rid="ref23">23</xref>]. A prompt acts as a clear instruction provided to a language model to generate the desired output, in our case, an answer to a question frequently asked by a patient with glaucoma. The language models were all prompted in a zero-shot fashion, meaning that no examples of questions were provided in the prompt. The prompt was specific and contextual: &#x201C;Act as a glaucoma specialist during a medical appointment and answer the following question considering it was asked by a patient.&#x201D; The same prompt was used for the SLM and LLM before each of the 35 selected questions was presented as a stand-alone query. After each query, the conversation was reset to minimize the memory retention bias. All generated responses were formatted as plain text to conceal chatbot-specific features and randomly shuffled before being presented to 3 glaucoma specialists for grading.</p></sec><sec id="s2-5"><title>Accuracy and Quality Evaluation</title><p>Each answer was evaluated by 3 glaucoma specialists (MG, LB, and VVC). The language models&#x2019; identities were concealed to prevent bias, and the presentation order was randomized for the graders. Their main task was to individually rate the accuracy of language model responses on a 3-point scale: +1 for responses containing inaccuracies that could significantly mislead patients and potentially cause harm (ie, &#x201C;poor&#x201D;); +2 for responses with possible factual errors, but unlikely to mislead or harm the patient (&#x201C;borderline&#x201D;); and +3 for &#x201C;good&#x201D; or error-free responses. 
Each response&#x2019;s total quality score was calculated by summing the scores of all 3 graders, with a minimum possible score of 3 and a maximum possible score of 9. In addition, we used a majority consensus approach to obtain an &#x201C;overall&#x201D; accuracy rating for each chatbot response, considering the most common rating among the 3 graders. In cases where there was no consensus among graders (ie, each grader provided a different rating), we adopted a stringent approach and assigned the lowest rating. Agreement among graders was evaluated using Fleiss kappa.</p></sec><sec id="s2-6"><title>Readability and Quality of Health Information Evaluation</title><p>To assess the readability of the chatbot answers, each answer was input into an online readability tool (Readable) [<xref ref-type="bibr" rid="ref24">24</xref>]. Four readability scales were used, including the Flesch-Kincaid Grade Level, Gunning Fog Index, Coleman-Liau Index, and Simple Measure of Gobbledygook (SMOG) Index. All readability formulas estimate the number of years of education required to fully understand a text. However, each formula uses different equations and variables to calculate it. The Flesch-Kincaid Grade Level focuses on words per sentence and syllables per word. The Gunning Fog Index considers words per sentence and the proportion of complex words (those with 3 or more syllables). The Coleman-Liau Index measures the average number of letters per 100 words and the average number of sentences per 100 words. The SMOG Index focuses on the number of polysyllabic words in a sample of 30 sentences.</p><p>The formula&#x2019;s output is a number, called the grade level, corresponding to the years of education required to fully understand the text. Content aimed at the public should have a grade level of around 8. 
Texts above 17 require a graduate-level education for complete comprehension [<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>Statistical analyses were performed using the Stata Statistical Software Release 18 (StataCorp LLC). The proportions of &#x201C;Good,&#x201D; &#x201C;Borderline,&#x201D; and &#x201C;Poor&#x201D; accuracy ratings were compared between SLM and LLM using a 2-tailed Fisher exact test. The Wilcoxon rank-sum test was used to examine the differences between the 2 language models&#x2019; overall answer quality and comprehensiveness scores. Fleiss kappa was calculated to measure interrater agreement. Statistical significance was set at <italic>P</italic>&#x003C;.05 for all analyses. Post hoc power analysis was performed to assess the observed mean difference in quality scores between the language models. We calculated the standardized effect size based on the observed means and pooled SD and estimated statistical power using a 2-tailed <italic>t</italic> test with an &#x03B1; level of .05.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>In accordance with the Declaration of Helsinki, this study did not involve patients or identifiable private information. Therefore, review and approval by the University of Miami Institutional Review Board were not required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>A total of 35 frequently asked questions from patients with glaucoma were answered by the LLM and SLM and evaluated by the 3 glaucoma specialists, and a total of 105 gradings were assigned. The interrater agreement, measured by Fleiss &#x03BA; among graders, was 0.28. The partial agreement rate between graders was 94.3% (99/105). 
Across the 105 individual accuracy ratings assigned to each model, the LLM had 74% (n=78) of the answers classified as good, 20% (n=21) as borderline, and 6% (n=6) as poor among the graders versus 57% (n=60), 31% (n=33), and 11% (n=12) for the SLM, respectively (<italic>P</italic>=.38). The distribution of quality scores assigned by the graders demonstrated slightly higher central tendency values for the LLM but substantial overlap between models. The median quality score was 8 (IQR 2) for the LLM and 7 (IQR 3) for the SLM, indicating greater variability in evaluator scoring. The minimum and maximum observed scores were 5&#x2013;9 for the LLM and 4&#x2013;9 for the SLM. No statistically significant difference was observed between the quality scores from SLM (mean 7.4, SD 1.5 points) and LLM (mean 7.9, SD 1.2 points; <italic>P</italic>=.13). Post hoc power analysis indicated that the statistical power to detect this observed difference was 32.9%. <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> details the SLM answers and the references used. <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> shows the answers provided by ChatGPT 4.0.</p><p><xref ref-type="table" rid="table1">Table 1</xref> presents an analysis of the consensus-based accuracy ratings overall and across the 6 glaucoma care domains. There was no difference in overall accuracy ratings between the language models (<italic>P</italic>=.38). For each domain, both models performed similarly in all areas. The highest performance by the SLM was in pathogenesis, with 86% (6/7) of the answers graded as &#x201C;Good,&#x201D; while the lowest was in treatment and prevention, where 28.5% (2/7) of the answers were graded as &#x201C;Poor.&#x201D; Alternatively, LLM&#x2019;s greatest performing domains were pathogenesis, treatment and prevention, and prognosis. 
LLM&#x2019;s worst performance domain was risk factors, where 17% (1/6) of the answers were graded as &#x201C;Poor.&#x201D;</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Consensus-based accuracy ratings of natural language models responses across glaucoma care domains.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Domain</td><td align="left" valign="bottom" rowspan="2">Number of questions</td><td align="left" valign="bottom" colspan="3">Small language model, n (%)</td><td align="left" valign="bottom" colspan="3">Large language model, n (%)</td><td align="left" valign="bottom" rowspan="2"><italic>P</italic> value</td></tr><tr><td align="left" valign="bottom">Poor</td><td align="left" valign="bottom">Borderline</td><td align="left" valign="bottom">Good</td><td align="left" valign="bottom">Poor</td><td align="left" valign="bottom">Borderline</td><td align="left" valign="bottom">Good</td></tr></thead><tbody><tr><td align="left" valign="top">Pathogenesis</td><td align="left" valign="top">7</td><td align="left" valign="top">0</td><td align="left" valign="top">1 (14)</td><td align="left" valign="top">6 (86)</td><td align="left" valign="top">1 (14)</td><td align="left" valign="top">0</td><td align="left" valign="top">6 (86)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Risk factors</td><td align="left" valign="top">6</td><td align="left" valign="top">1 (17)</td><td align="left" valign="top">2 (33)</td><td align="left" valign="top">3 (50)</td><td align="left" valign="top">1 (17)</td><td align="left" valign="top">1 (17)</td><td align="left" valign="top">4 (66)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Clinical presentation</td><td align="left" valign="top">6</td><td align="left" valign="top">1 (17)</td><td align="left" valign="top">1 (17)</td><td align="left" valign="top">4 (66)</td><td 
align="left" valign="top">0</td><td align="left" valign="top">3 (50)</td><td align="left" valign="top">3 (50)</td><td align="left" valign="top">.54</td></tr><tr><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td><td align="left" valign="top">1 (50)</td><td align="left" valign="top">1 (50)</td><td align="left" valign="top">0</td><td align="left" valign="top">1 (50)</td><td align="left" valign="top">1 (50)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Treatment and prevention</td><td align="left" valign="top">7</td><td align="left" valign="top">2 (28.5)</td><td align="left" valign="top">3 (44)</td><td align="left" valign="top">2 (28.5)</td><td align="left" valign="top">0</td><td align="left" valign="top">1 (14)</td><td align="left" valign="top">6 (86)</td><td align="left" valign="top">.14</td></tr><tr><td align="left" valign="top">Prognosis</td><td align="left" valign="top">7</td><td align="left" valign="top">0</td><td align="left" valign="top">3 (43)</td><td align="left" valign="top">4 (57)</td><td align="left" valign="top">0</td><td align="left" valign="top">1 (14)</td><td align="left" valign="top">6 (86)</td><td align="left" valign="top">.56</td></tr><tr><td align="left" valign="top">Overall</td><td align="left" valign="top">35</td><td align="left" valign="top">4 (11.55)</td><td align="left" valign="top">11 (31.5)</td><td align="left" valign="top">20 (57)</td><td align="left" valign="top">2 (6)</td><td align="left" valign="top">7 (20)</td><td align="left" valign="top">26 (74)</td><td align="left" valign="top">.38</td></tr></tbody></table></table-wrap><p><xref ref-type="table" rid="table2">Table 2</xref> shows the quality scores for each natural language model overall and throughout the 6 glaucoma care domains. The overall quality scores for the SLM and LLM were 258 and 277 (<italic>P</italic>=.13), respectively. 
The differences in quality scores between all the glaucoma care domains were not statistically significant.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Consensus-based quality scores of natural language models responses across glaucoma care domains.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Domain</td><td align="left" valign="bottom" rowspan="2">Number of questions</td><td align="left" valign="bottom" colspan="2">Quality scores</td><td align="left" valign="bottom" rowspan="2"><italic>P</italic> value</td></tr><tr><td align="left" valign="bottom">Small language model</td><td align="left" valign="top">Large language model</td></tr></thead><tbody><tr><td align="left" valign="top">Pathogenesis</td><td align="left" valign="top">7</td><td align="left" valign="top">58</td><td align="left" valign="top">56</td><td align="left" valign="top">.62</td></tr><tr><td align="left" valign="top">Risk factors</td><td align="left" valign="top">6</td><td align="left" valign="top">41</td><td align="left" valign="top">46</td><td align="left" valign="top">.40</td></tr><tr><td align="left" valign="top">Clinical presentation</td><td align="left" valign="top">6</td><td align="left" valign="top">46</td><td align="left" valign="top">46</td><td align="left" valign="top">.87</td></tr><tr><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">2</td><td align="left" valign="top">15</td><td align="left" valign="top">14</td><td align="left" valign="top">.68</td></tr><tr><td align="left" valign="top">Treatment and prevention</td><td align="left" valign="top">7</td><td align="left" valign="top">46</td><td align="left" valign="top">58</td><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top">Prognosis</td><td align="left" valign="top">7</td><td align="left" valign="top">52</td><td align="left" valign="top">57</td><td align="left" 
valign="top">.45</td></tr><tr><td align="left" valign="top">Overall</td><td align="left" valign="top">35</td><td align="left" valign="top">258</td><td align="left" valign="top">277</td><td align="left" valign="top">.13</td></tr></tbody></table></table-wrap><p><xref ref-type="table" rid="table3">Table 3</xref> summarizes the readability scores of the responses for each natural language model. The mean Flesch-Kincaid grade level was 13.2 (SD 3.2) for the SLM and 11.8 (SD 2.2) for the LLM. For the Gunning Fog Index, mean scores were 17.7 (SD 4.3) for the SLM and 14.4 (SD 3.0) for the LLM. The mean results of the Coleman-Liau Index were 14.7 (SD 3.0) for the SLM compared to 12.5 (SD 1.5) for the LLM. The mean scores of the SMOG Index were recorded as 15.98 (SD 2.9) for the SLM and 13.9 (SD 2.1) for the LLM. In all 4 readability classification systems, the SLM had statistically significantly higher scores (<italic>P</italic>&#x003C;.001).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Mean readability grade level for small language model and large language model responses<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Readability scores</td><td align="left" valign="bottom">Flesch-Kincaid grade level, mean (SD)</td><td align="left" valign="bottom">Gunning fog index, mean (SD)</td><td align="left" valign="bottom">Coleman-Liau index, mean (SD)</td><td align="left" valign="bottom">Simple measure of gobbledygook (SMOG) Index, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">SLM<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">13.2 (3.2)</td><td align="left" valign="top">17.7 (4.3)</td><td align="left" valign="top">14.7 (3.0)</td><td align="left" valign="top">15.98 (2.9)</td></tr><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" 
rid="table3fn3">c</xref></sup></td><td align="left" valign="top">11.8 (2.2)</td><td align="left" valign="top">14.4 (3.0)</td><td align="left" valign="top">12.5 (1.5)</td><td align="left" valign="top">13.9 (2.1)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup><italic>P</italic>&#x003C;.001 in all 4 comparisons. </p></fn><fn id="table3fn2"><p><sup>b</sup>SLM: small language model.</p></fn><fn id="table3fn3"><p><sup>c</sup>LLM: large language model. </p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we developed and evaluated an SLM trained specifically in ophthalmology to yield clinically relevant information and answer frequently asked questions about glaucoma. The responses provided by our model were as accurate as ChatGPT 4.0, an LLM trained with billions of parameters, as evaluated by glaucoma specialists. To the best of our knowledge, this is the first study to compare the performance of an SLM powered by RAG with ChatGPT 4.0, demonstrating the feasibility of using a local model to answer frequently asked questions about glaucoma and provide references for further reading.</p><p>The answers from the SLM developed in this study achieved a mean quality score of 7.4 (SD 1.5) points, which was comparable to the mean quality score of the LLM (7.9, SD 1.2 points out of a total of 9 points; <italic>P</italic>=.13). Moreover, the consensus-based accuracy ratings for the answers of both natural language models were also considered equivalent (<italic>P</italic>=.38). The performance of SLM was also comparable in all 6 glaucoma domains studied, including pathogenesis, risk factors, clinical presentation, diagnosis, treatment and prevention, and prognosis. 
These results highlight the potential role of SLMs in ophthalmology practice, as they offer a more affordable, adaptable, and straightforward integration into actual ophthalmology clinics. Furthermore, unlike ChatGPT 4.0, which is not open-source and refines its model using user-provided information, SLMs can be trained and operated locally within an institution, significantly reducing the risk of sensitive information leakage, making them a more realistic choice for future integration of natural language models in practical settings [<xref ref-type="bibr" rid="ref12">12</xref>]. A previous study by Sharir et al [<xref ref-type="bibr" rid="ref26">26</xref>] estimated the cost of US $80,000 per 1.5 billion parameter model. In this context, training a model such as ChatGPT 4.0 would require US $96,000,000, while an SLM such as the one used in our study would require US $373,000, a more realistic amount for many institutions worldwide [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>The use of natural language models in artificial intelligence&#x2013;driven chatbots has increasingly infiltrated daily life [<xref ref-type="bibr" rid="ref27">27</xref>]. The ability of these models to provide immediate answers across a wide array of inquiries has garnered considerable interest in the health care sector [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. In ophthalmology practice, one of the most relevant applications of natural language models is responding to patient queries commonly encountered in practice [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. Lim et al [<xref ref-type="bibr" rid="ref32">32</xref>] compared the performance of 3 different LLMs in answering frequent questions about myopia. 
Using a 3-level grading scale similar to our study (poor, borderline, and good), they reported mean total scores of 8.19 (SD 1.14) for ChatGPT-4.0, 7.35 (SD 1.70) for ChatGPT-3.5, and 7.13 (SD 1.63) for Google Bard. Regarding categorical ratings, 80.6% of ChatGPT-4.0 responses were classified as &#x201C;good,&#x201D; compared to 61.3% for ChatGPT-3.5 and 54.8% for Google Bard. Our findings, with mean total scores of 7.9 (SD 1.2) points for the LLM (ChatGPT-4.0) and 7.4 (SD 1.5) points for the ophthalmology-specific SLM, align closely with these previous results. Furthermore, the proportion of responses classified as &#x201C;good&#x201D; in our study (78/105, 74% for the LLM and 60/105, 57% for the SLM) is consistent with previously reported results also by Lim et al [<xref ref-type="bibr" rid="ref32">32</xref>]. When Momenaei et al [<xref ref-type="bibr" rid="ref33">33</xref>] evaluated ChatGPT 4.0&#x2019;s ability to address retinal disease queries, responses were considered appropriate in 84.6%, 92%, and 91.7% of the questions concerning retinal detachments, macular holes, and epiretinal membranes, respectively. In both instances, the ChatGPT 4.0 responses were graded by different groups of ophthalmologists as consistently appropriate. Despite these positive results, LLMs, such as ChatGPT, are often expensive, inflexible, and unfeasible to implement in local contexts. Recent advancements in NLP also include multimodal LLMs [<xref ref-type="bibr" rid="ref34">34</xref>]. For instance, Choi et al [<xref ref-type="bibr" rid="ref34">34</xref>] successfully used multimodal language models to integrate structured ocular data to calculate safety indicators and predict contraindications in laser vision correction procedures. Their results indicated superior accuracy and flexibility compared to traditional machine learning approaches, underscoring significant clinical potential. 
Despite these encouraging outcomes, practical challenges remain regarding the broader implementation of such advanced technologies in clinical settings. Specifically, multimodal models often require significant computational resources, entail high costs, and may raise concerns about data security and patient privacy. Thus, while multimodal approaches offer considerable promise, specialized smaller scale models, such as the SLM presented in our study, represent a cheaper and feasible solution for real-world deployment, balancing accuracy, adaptability, cost-efficiency, and local data control.</p><p>One major concern of implementing ChatGPT in clinical settings is its lack of ability to provide source citations [<xref ref-type="bibr" rid="ref35">35</xref>]. Studies have indicated that ChatGPT often provides false references for its generated responses, leading to concerns over response reliability and the risk of inaccuracies [<xref ref-type="bibr" rid="ref36">36</xref>]. In contrast, the combination of RAG with SLM guarantees the citation of all sources, offering clear evidence for shared information. This ability is a crucial benefit of SLM in clinical contexts, enhancing its utility in delivering reliable, evidence-supported information to patients. Unlike ChatGPT 4.0, which cannot cite references for its responses, SLM equipped with RAG can specify the exact reference and its metadata, including DOI, publication year, and journal name, used to generate a response. The ability to locally deploy domain-specific SLMs with RAG opens several avenues for real-world clinical use. In ophthalmology clinics, SLMs could serve as virtual assistants capable of providing preliminary education to patients, addressing common concerns before or after consultations, and supporting decision-making through curated literature. This could reduce physician workload and improve information retention. 
These systems could also be embedded in telemedicine platforms or patient portals to enhance access to personalized, trustworthy, and reference-backed content, especially for chronic conditions like glaucoma.</p><p>Although our study did not directly compare the models&#x2019; responses to responses by human experts, recent evidence suggests that language models may already be approaching human-level performance in natural language generation [<xref ref-type="bibr" rid="ref37">37</xref>]. A preprint by Jones et al [<xref ref-type="bibr" rid="ref37">37</xref>] demonstrated that when appropriately prompted to adopt a human persona, state-of-the-art LLMs were judged to be the human more often than real human participants in a controlled 3-party Turing test, effectively passing the original Turing test design. These findings imply that, at least in open-ended conversational tasks, language models may generate responses that are indistinguishable from those of real people. While this supports the plausibility of expert-level performance in patient education tasks, further research is required to compare model-generated content to clinician-authored responses within ophthalmology-specific domains directly.</p><p>Previous studies have shown that natural language models often generate grammatically correct responses to common patient inquiries [<xref ref-type="bibr" rid="ref38">38</xref>]. However, these answers are complex and difficult for the average layperson to understand fully [<xref ref-type="bibr" rid="ref39">39</xref>]. The American Medical Association recommends that health-related information be communicated at a grade level score of 5-6, which is equivalent to the reading level of fifth- to sixth-graders [<xref ref-type="bibr" rid="ref40">40</xref>]. 
Previous research has indicated that information on glaucoma available online is often written at a grade level that is not suitable for health-related information [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]. Our analysis revealed that the answers from both LLM and SLM share the same limitation of requiring high-level education to fully understand the answers. In our study, the grade level mean scores, measured by the Flesch-Kincaid Grade Level, the Gunning Fog Index, the Coleman-Liau Index, and the SMOG Index, were 13.2 (SD 3.2), 17.7 (SD 4.3), 14.7 (SD 3.0), and 15.98 (SD 2.9), respectively, for the SLM, and 11.8 (SD 2.2), 14.4 (SD 3.0), 12.5 (SD 1.5), and 13.9 (SD 2.1) for the LLM. The SLM had a statistically significantly higher grade level in all 4 metrics (<italic>P</italic>&#x003C;.001). This finding is associated with the usage of scientific resources only as the source material for the SLM responses, as this material is written at an academic level.</p><p>This study had several limitations. It was conducted with a limited set of questions, focusing solely on a single ophthalmological condition evaluated by a small panel of 3 glaucoma specialists within a single institution. A multicenter evaluation on a larger dataset of questions would offer additional insights into the performance of the SLM powered with RAG versus LLM in answering questions frequently asked by patients with glaucoma. Moreover, this study did not directly assess patient response evaluations. Future studies measuring patients&#x2019; opinions on the clarity and quality of the answers could reveal more details regarding using natural language models as a tool for answering glaucoma-related questions. Additionally, the model was not designed exclusively to respond to frequently asked questions about glaucoma but was trained to address ophthalmological inquiries in a broader and more technical context. 
This approach could have resulted in an underestimation of the SLM&#x2019;s performance. However, this study stands as proof of concept, and the SLM can be further tailored to specific tasks and other domains in ophthalmology. Furthermore, the post hoc power analysis shows that the sample size of 35 questions provided only 32.9% power to detect the observed difference in quality scores. This indicates a high risk of a type II error, suggesting that the lack of statistical significance may be due to insufficient power rather than equivalence in model performance. Future studies with larger sample sizes are needed to assess potential differences between SLM and LLM performances more robustly. Moreover, the prompt did not contain specific instructions to generate answers at a particular grade level, which could yield more easily understood answers and should be explored by future studies. Finally, this study did not include a direct comparison between the responses generated by the language models and human experts. Future research should evaluate how SLM and LLM outputs compare to clinician-authored answers regarding accuracy, appropriateness, and patient comprehension.</p></sec><sec id="s4-2"><title>Conclusion</title><p>In conclusion, our study revealed that a specialized SLM may be able to perform similarly to an LLM in answering frequently asked glaucoma questions. However, their answers were unsuitable for health care&#x2013;related information, as they would be difficult for the average layperson to comprehend. Given their comparable performance to LLMs, high customization potential, ability to provide citations, low cost, and capacity to operate locally without collecting sensitive data, specialized SLMs may present as a realistic option for deploying NLP in real-world ophthalmology clinical settings. 
Further research is needed to investigate the incorporation of health care&#x2013;related texts with greater readability into SLMs, as they could be more easily adapted to generate accurate and easy-to-understand answers.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>No external financial support or grants were received from any public, commercial, or not-for-profit entities for the research, authorship, or publication of this article.</p></sec><sec><title>Data Availability</title><p>The datasets generated and analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: ACF, RS</p><p>Data curation: ACF, RS, RM, MGF, LB, VVC</p><p>Formal analysis: ACF, AAJ</p><p>Methodology: ACF, RS, AAJ</p><p>Investigation: ACF, RS, DC, MGF, LB, VVC</p><p>Project administration: FAM, AAJ</p><p>Resources: ACF, RS, DC, MGF, LB, VVC</p><p>Software: ACF, RS, RM</p><p>Supervision: RS, AAJ, FAM</p><p>Validation: ACF, RS</p><p>Visualization: ACF</p><p>Writing&#x2013;original draft: ACF</p><p>Writing&#x2013;review &#x0026; editing: All authors critically revised the manuscript and approved the final version.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AAO </term><def><p>American Academy of Ophthalmology</p></def></def-item><def-item><term id="abb2">AGS</term><def><p>American Glaucoma Society</p></def></def-item><def-item><term id="abb3">BPEI</term><def><p>Bascom Palmer Eye Institute</p></def></def-item><def-item><term id="abb4">LLM </term><def><p>large language model</p></def></def-item><def-item><term id="abb5">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb6">RAG </term><def><p>retrieval-augmented generation</p></def></def-item><def-item><term id="abb7">SLM</term><def><p>small language model</p></def></def-item><def-item><term 
id="abb8">SMOG</term><def><p>Simple Measure of Gobbledygook</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanders</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>K</given-names> </name><name name-style="western"><surname>Chow</surname><given-names>JCL</given-names> </name></person-group><article-title>Chatbot for healthcare and oncology applications using artificial intelligence and machine learning: systematic review</article-title><source>JMIR Cancer</source><year>2021</year><month>11</month><day>29</day><volume>7</volume><issue>4</issue><fpage>e27850</fpage><pub-id pub-id-type="doi">10.2196/27850</pub-id><pub-id pub-id-type="medline">34847056</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shemer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Altarescu</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Diagnostic capabilities of ChatGPT in ophthalmology</article-title><source>Graefes Arch Clin Exp Ophthalmol</source><year>2024</year><month>07</month><volume>262</volume><issue>7</issue><fpage>2345</fpage><lpage>2352</lpage><pub-id pub-id-type="doi">10.1007/s00417-023-06363-z</pub-id><pub-id pub-id-type="medline">38183467</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Antaki</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Touma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Milad</surname><given-names>D</given-names> </name><name name-style="western"><surname>El-Khoury</surname><given-names>J</given-names> </name><name name-style="western"><surname>Duval</surname><given-names>R</given-names> </name></person-group><article-title>Evaluating the performance of ChatGPT in ophthalmology: an analysis of its successes and shortcomings</article-title><source>Ophthalmol Sci</source><year>2023</year><month>12</month><volume>3</volume><issue>4</issue><fpage>100324</fpage><pub-id pub-id-type="doi">10.1016/j.xops.2023.100324</pub-id><pub-id pub-id-type="medline">37334036</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Angelis</surname><given-names>L</given-names> </name><name name-style="western"><surname>Baglivo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Arzilli</surname><given-names>G</given-names> </name><etal/></person-group><article-title>ChatGPT and the rise of large language models: the new AI-driven infodemic threat in public health</article-title><source>Front Public Health</source><year>2023</year><volume>11</volume><fpage>1166120</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2023.1166120</pub-id><pub-id pub-id-type="medline">37181697</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title><source>N Engl J 
Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuehn</surname><given-names>BM</given-names> </name></person-group><article-title>More than one-third of US individuals use the internet to self-diagnose</article-title><source>JAMA</source><year>2013</year><month>02</month><day>27</day><volume>309</volume><issue>8</issue><fpage>756</fpage><pub-id pub-id-type="doi">10.1001/jama.2013.629</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Dis</surname><given-names>EAM</given-names> </name><name name-style="western"><surname>Bollen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zuidema</surname><given-names>W</given-names> </name><name name-style="western"><surname>van Rooij</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bockting</surname><given-names>CL</given-names> </name></person-group><article-title>ChatGPT: five priorities for research</article-title><source>Nature New Biol</source><year>2023</year><month>02</month><volume>614</volume><issue>7947</issue><fpage>224</fpage><lpage>226</lpage><pub-id pub-id-type="doi">10.1038/d41586-023-00288-7</pub-id><pub-id pub-id-type="medline">36737653</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cai</surname><given-names>LZ</given-names> </name><name name-style="western"><surname>Shaheen</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Jin</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of generative large language models on ophthalmology board-style questions</article-title><source>Am J Ophthalmol</source><year>2023</year><month>10</month><volume>254</volume><fpage>141</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1016/j.ajo.2023.05.024</pub-id><pub-id pub-id-type="medline">37339728</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Hirabayashi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Barna</surname><given-names>L</given-names> </name><name name-style="western"><surname>Parikh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pasquale</surname><given-names>LR</given-names> </name></person-group><article-title>Assessment of a Large Language model&#x2019;s responses to questions and cases about glaucoma and retina management</article-title><source>JAMA Ophthalmol</source><year>2024</year><month>04</month><day>1</day><volume>142</volume><issue>4</issue><fpage>371</fpage><lpage>375</lpage><pub-id pub-id-type="doi">10.1001/jamaophthalmol.2023.6917</pub-id><pub-id pub-id-type="medline">38386351</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raghu</surname><given-names>K</given-names> </name><name name-style="western"><surname>S</surname><given-names>T</given-names> </name><name name-style="western"><surname>S Devishamani</surname><given-names>C</given-names> </name><name name-style="western"><surname>M</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Rajalakshmi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Raman</surname><given-names>R</given-names> </name></person-group><article-title>The utility of ChatGPT in diabetic retinopathy risk assessment: a comparative study with clinical diagnosis</article-title><source>Clin Ophthalmol</source><year>2023</year><volume>17</volume><fpage>4021</fpage><lpage>4031</lpage><pub-id pub-id-type="doi">10.2147/OPTH.S435052</pub-id><pub-id pub-id-type="medline">38164506</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tay</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bommasani</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Emergent abilities of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 26, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2206.07682</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sabharwal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khot</surname><given-names>T</given-names> </name></person-group><article-title>Specializing smaller language models towards multi-step reasoning</article-title><access-date>2025-11-30</access-date><conf-name>Proceedings of the 40th International Conference on Machine Learning</conf-name><conf-date>Jul 23-29, 
2023</conf-date><conf-loc>Honolulu, HI</conf-loc><fpage>10421</fpage><lpage>10430</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v202/fu23d.html">https://proceedings.mlr.press/v202/fu23d.html</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name></person-group><article-title>Augmenting black-box llms with medical textbooks for biomedical question answering</article-title><year>2023</year><access-date>2025-11-30</access-date><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Nov 12-16, 2024</conf-date><conf-loc>Miami, FL</conf-loc><fpage>1754</fpage><lpage>1770</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.findings-emnlp">https://aclanthology.org/2024.findings-emnlp</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2024.findings-emnlp.95</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lozano</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fleming</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Chiang</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>N</given-names> </name></person-group><article-title>Clinfo.ai: an open-source retrieval-augmented large language model system for answering medical questions using scientific literature</article-title><source>Pac Symp 
Biocomput</source><year>2024</year><volume>29</volume><fpage>8</fpage><lpage>23</lpage><pub-id pub-id-type="medline">38160266</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><etal/></person-group><article-title>HuggingFace&#x2019;s transformers: state-of-the-art natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 14, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.03771</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Pietsch</surname><given-names>M</given-names> </name><name name-style="western"><surname>M&#x00F6;ller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kostic</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Haystack</article-title><source>GitHub</source><access-date>2025-11-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/deepset-ai/haystack">https://github.com/deepset-ai/haystack</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mistral 7B</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 10, 
2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.06825</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xa</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name></person-group><article-title>AnglE-optimized text embeddings</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.12871</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Gpt-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 4, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 22, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Ouyang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Training language models to follow instructions with human feedback</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 4, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.02155</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stokel-Walker</surname><given-names>C</given-names> </name><name name-style="western"><surname>Van Noorden</surname><given-names>R</given-names> </name></person-group><article-title>What ChatGPT and generative AI mean for science</article-title><source>Nature New Biol</source><year>2023</year><month>02</month><volume>614</volume><issue>7947</issue><fpage>214</fpage><lpage>216</lpage><pub-id pub-id-type="doi">10.1038/d41586-023-00340-6</pub-id><pub-id pub-id-type="medline">36747115</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>Prompt engineering as an important emerging skill for medical professionals: tutorial</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>4</day><volume>25</volume><fpage>e50638</fpage><pub-id pub-id-type="doi">10.2196/50638</pub-id><pub-id pub-id-type="medline">37792434</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><source>Readable</source><access-date>2025-11-30</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://app.readable.com/text/">https://app.readable.com/text/</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Kloosterboer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yannuzzi</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Venkateswaran</surname><given-names>N</given-names> </name><name name-style="western"><surname>Sridhar</surname><given-names>J</given-names> </name></person-group><article-title>Evaluation of the content, quality, and readability of patient accessible online resources regarding cataracts</article-title><source>Semin Ophthalmol</source><year>2021</year><month>08</month><day>18</day><volume>36</volume><issue>5-6</issue><fpage>384</fpage><lpage>391</lpage><pub-id pub-id-type="doi">10.1080/08820538.2021.1893758</pub-id><pub-id pub-id-type="medline">33634726</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sharir</surname><given-names>O</given-names> </name><name name-style="western"><surname>Peleg</surname><given-names>B</given-names> </name><name name-style="western"><surname>Shoham</surname><given-names>Y</given-names> </name></person-group><article-title>The cost of training NLP models: a concise overview</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 19, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.08900</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Jin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>R</given-names> </name></person-group><article-title>Harnessing the power of LLMs in practice: a survey on ChatGPT and beyond</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.13712</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name></person-group><article-title>The rise of ChatGPT: exploring its potential in medical education</article-title><source>Anat Sci Educ</source><year>2024</year><volume>17</volume><issue>5</issue><fpage>926</fpage><lpage>931</lpage><pub-id pub-id-type="doi">10.1002/ase.2270</pub-id><pub-id pub-id-type="medline">36916887</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sallam</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title><source>Healthcare (Basel)</source><year>2023</year><month>03</month><day>19</day><volume>11</volume><issue>6</issue><fpage>887</fpage><pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id><pub-id pub-id-type="medline">36981544</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>X</given-names> 
</name><name name-style="western"><surname>Deng</surname><given-names>C</given-names> </name></person-group><article-title>Leveraging generative AI and large language models: a comprehensive roadmap for healthcare integration</article-title><source>Healthcare (Basel)</source><year>2023</year><month>10</month><day>20</day><volume>11</volume><issue>20</issue><fpage>2776</fpage><pub-id pub-id-type="doi">10.3390/healthcare11202776</pub-id><pub-id pub-id-type="medline">37893850</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bernstein</surname><given-names>IA</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>YV</given-names> </name><name name-style="western"><surname>Govil</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Comparison of ophthalmologist and large language model chatbot responses to online patient eye care questions</article-title><source>JAMA Netw Open</source><year>2023</year><month>08</month><day>1</day><volume>6</volume><issue>8</issue><fpage>e2330320</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.30320</pub-id><pub-id pub-id-type="medline">37606922</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>ZW</given-names> </name><name name-style="western"><surname>Pushpanathan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yew</surname><given-names>SME</given-names> </name><etal/></person-group><article-title>Benchmarking large language models&#x2019; performances for myopia care: a comparative analysis of ChatGPT-3.5, ChatGPT-4.0, and Google 
Bard</article-title><source>EBioMedicine</source><year>2023</year><month>09</month><volume>95</volume><fpage>104770</fpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2023.104770</pub-id><pub-id pub-id-type="medline">37625267</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Momenaei</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wakabayashi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shahlaee</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Appropriateness and readability of ChatGPT-4&#x2013;generated responses for surgical treatment of retinal diseases</article-title><source>Ophthalmol Retina</source><year>2023</year><month>10</month><volume>7</volume><issue>10</issue><fpage>862</fpage><lpage>868</lpage><pub-id pub-id-type="doi">10.1016/j.oret.2023.05.022</pub-id><pub-id pub-id-type="medline">37277096</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yoo</surname><given-names>TK</given-names> </name></person-group><article-title>Application of multimodal large language models for safety indicator calculation and contraindication prediction in laser vision correction</article-title><source>NPJ Digit Med</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>1</issue><fpage>82</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01487-4</pub-id><pub-id 
pub-id-type="medline">39900802</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhattacharyya</surname><given-names>M</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>VM</given-names> </name><name name-style="western"><surname>Bhattacharyya</surname><given-names>D</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>LE</given-names> </name></person-group><article-title>High rates of fabricated and inaccurate references in ChatGPT&#x2010;generated medical content</article-title><source>Cureus</source><year>2023</year><month>05</month><volume>15</volume><issue>5</issue><fpage>e39238</fpage><pub-id pub-id-type="doi">10.7759/cureus.39238</pub-id><pub-id pub-id-type="medline">37337480</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walters</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Wilder</surname><given-names>EI</given-names> </name></person-group><article-title>Fabrication and errors in the bibliographic citations generated by ChatGPT</article-title><source>Sci Rep</source><year>2023</year><month>09</month><day>7</day><volume>13</volume><issue>1</issue><fpage>14045</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-41032-5</pub-id><pub-id pub-id-type="medline">37679503</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Bergen</surname><given-names>BK</given-names> </name></person-group><article-title>Large language models pass the turing test</article-title><source>arXiv</source><comment>Preprint 
posted online on  Mar 31, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.23674</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>M</given-names> </name></person-group><article-title>The role of ChatGPT in scientific communication: writing better scientific review articles</article-title><source>Am J Cancer Res</source><year>2023</year><volume>13</volume><issue>4</issue><fpage>1148</fpage><lpage>1154</lpage><pub-id pub-id-type="medline">37168339</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kianian</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>D</given-names> </name><name name-style="western"><surname>Giaconi</surname><given-names>J</given-names> </name></person-group><article-title>Can ChatGPT aid clinicians in educating patients on the surgical management of glaucoma</article-title><source>J Glaucoma</source><year>2024</year><month>02</month><day>1</day><volume>33</volume><issue>2</issue><fpage>94</fpage><lpage>100</lpage><pub-id pub-id-type="doi">10.1097/IJG.0000000000002338</pub-id><pub-id pub-id-type="medline">38031276</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Weiss</surname><given-names>B</given-names> </name></person-group><source>Health Literacy: A Manual for Clinicians</source><year>2003</year><publisher-name>American Medical Association Foundation and American Medical Association</publisher-name></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Martin</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Readability and suitability of online patient education materials for glaucoma</article-title><source>Ophthalmol Glaucoma</source><year>2022</year><volume>5</volume><issue>5</issue><fpage>525</fpage><lpage>530</lpage><pub-id pub-id-type="doi">10.1016/j.ogla.2022.03.004</pub-id><pub-id pub-id-type="medline">35301989</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jia</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Shukla</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>D</given-names> </name><name name-style="western"><surname>Razeghinejad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Myers</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Kolomeyer</surname><given-names>NN</given-names> </name></person-group><article-title>What glaucoma patients are reading on the internet: a systematic analysis of online glaucoma content</article-title><source>Ophthalmol Glaucoma</source><year>2022</year><volume>5</volume><issue>4</issue><fpage>447</fpage><lpage>451</lpage><pub-id pub-id-type="doi">10.1016/j.ogla.2022.01.002</pub-id><pub-id pub-id-type="medline">35114429</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Mahajan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Oydanich</surname><given-names>M</given-names> </name><name name-style="western"><surname>Khouri</surname><given-names>AS</given-names> </name></person-group><article-title>A comprehensive evaluation of the quality, readability, and technical quality of online information on glaucoma</article-title><source>Ophthalmol Glaucoma</source><year>2023</year><volume>6</volume><issue>1</issue><fpage>93</fpage><lpage>99</lpage><pub-id pub-id-type="doi">10.1016/j.ogla.2022.07.007</pub-id><pub-id pub-id-type="medline">35940574</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>List of the 35 frequently asked questions from patients with glaucoma used in the study.</p><media xlink:href="ai_v5i1e72101_app1.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Small language model answers and the references used.</p><media xlink:href="ai_v5i1e72101_app2.xlsx" xlink:title="XLSX File, 99 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3 </label><p>Responses generated by ChatGPT 4.0.</p><media xlink:href="ai_v5i1e72101_app3.xlsx" xlink:title="XLSX File, 16 KB"/></supplementary-material></app-group></back></article>