<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e83640</article-id><article-id pub-id-type="doi">10.2196/83640</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of Large Language Models Under Input Variability in Health Care Applications: Dataset Development and Experimental Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Joshi</surname><given-names>Saubhagya</given-names></name><degrees>BEng, MEng</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mehta</surname><given-names>Monjil</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Maniar</surname><given-names>Sarjak</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Mengqian</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Singh</surname><given-names>Vivek Kumar</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Library and Information Sciences, School of Communication &#x0026; Information, Rutgers University</institution><addr-line>4 Huntington St</addr-line><addr-line>New Brunswick</addr-line><addr-line>NJ</addr-line><country>United States</country></aff><aff id="aff2"><institution>Rutgers Business School, Rutgers University</institution><addr-line>New Brunswick</addr-line><addr-line>NJ</addr-line><country>United States</country></aff><aff id="aff3"><institution>University of North Carolina, Chapel Hill</institution><addr-line>Chapel Hill</addr-line><addr-line>NC</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Liu</surname><given-names>Hongfang</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Ogut</surname><given-names>Eren</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mahmoud</surname><given-names>Randa Salah Gomaa</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Adeniyi</surname><given-names>Samson</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Saubhagya Joshi, BEng, MEng, Library and Information Sciences, School of Communication &#x0026; Information, Rutgers University, 4 Huntington St, New Brunswick, NJ, 08901, United States, +1 (848) 932-7500; <email>srj96@scarletmail.rutgers.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date 
pub-type="epub"><day>20</day><month>2</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e83640</elocation-id><history><date date-type="received"><day>05</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>30</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Saubhagya Joshi, Monjil Mehta, Sarjak Maniar, Mengqian Wang, Vivek Kumar Singh. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 20.2.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e83640"/><abstract><sec><title>Background</title><p>Large language models (LLMs) are increasingly integrated into health care, where they contribute to patient care, administrative efficiency, and clinical decision-making. Despite their growing role, the ability of LLMs to handle imperfect inputs remains underexplored. 
These imperfections, which are common in clinical documentation and patient-generated data, may affect model reliability.</p></sec><sec><title>Objective</title><p>This study investigates the impact of input perturbations on LLM performance across three dimensions: (1) overall effectiveness in different health-related applications, (2) comparative effects of different types and levels of perturbations, and (3) differential impact of perturbations on health-related terms versus non&#x2013;health-related terms.</p></sec><sec sec-type="methods"><title>Methods</title><p>We systematically evaluate 3 LLMs on 3 health-related tasks using a novel dataset containing 3 types of human-like variations (redaction, homophones, and typographical errors) at different perturbation levels.</p></sec><sec sec-type="results"><title>Results</title><p>Contrary to expectations, LLMs demonstrate notable robustness to common variations, and in more than half of the cases (151/270, 55.93%), the performance was stable or improved. In some cases (38/270, 14.07%), variations resulted in an increased performance, especially when dealing with lower perturbation levels. Redactions, often stemming from privacy concerns or cognitive lapses, are more detrimental than other variations.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings highlight the need for health care applications powered by LLMs to be designed with input variability in mind. Robustness to noisy or imperfect inputs is essential for maintaining reliability in real-world clinical settings, where data quality can vary widely. By identifying specific vulnerabilities and strengths, this study provides actionable insights for improving model resilience and guiding the development of safer, more effective artificial intelligence tools in health care. 
The accompanying dataset offers a valuable resource for further research into LLM performance under diverse conditions.</p></sec></abstract><kwd-group><kwd>dataset</kwd><kwd>error analysis</kwd><kwd>health informatics</kwd><kwd>large language models</kwd><kwd>robustness</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Large language models (LLMs) have revolutionized health care by enhancing patient care, streamlining administrative processes, and advancing medical research. For instance, LLMs are used to analyze patient data for disease management, assist in prior authorization processes by summarizing extensive clinical records, and even support clinical decision-making by providing diagnosis suggestions. Additionally, ambient scribing, where LLMs generate clinical notes from doctor-patient interactions, can significantly reduce the administrative burden on health care providers [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>In such an increasingly LLM-mediated health care ecosystem, understanding LLM robustness&#x2014;defined as the model&#x2019;s ability to maintain consistent and accurate performance despite the presence of noise, ambiguity, or variability in input text&#x2014;is crucial. This includes resilience to typographical errors, domain-specific jargon, abbreviations, and other real-world linguistic variations commonly found in clinical and patient-generated text [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. 
Such real-world scenarios often invalidate the foundational assumption of many natural language processing systems that rely on clean datasets, especially in health care, where users are frequently tired, sick, or cognitively impaired, and may redact personal information from their queries, either accidentally or intentionally [<xref ref-type="bibr" rid="ref5">5</xref>]. For instance, related studies suggest that roughly 14% of health search queries contain typos [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>As a result, evaluations conducted under ideal conditions are not reflective of health care applications, where robustness to input variations is essential. Most existing evaluations focus on nonmedical contexts and do not prioritize robustness, sometimes narrowly construing robustness analysis as attempts to exploit the system for unsafe responses. An often-understudied approach is to assess how everyday users, seeking health information from LLMs, naturally introduce variations and errors in their inputs. Past research has identified several types of errors users make when typing search queries, including phonetic, morphological, orthographic, cognitive, and keyboard layout errors [<xref ref-type="bibr" rid="ref6">6</xref>]. While there is significant literature on these variations in human input, their impact on LLM performance in health settings remains understudied. In this study, we systematically examine the effects of 3 common types of human variations and levels of perturbation on the performance of LLMs across various health-related tasks. To do so, we also develop and share a novel health&#x2013;centric dataset with several types of human variations [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. We systematically evaluate the resilience of 3 different LLMs to variations of the original text, particularly in health care contexts. 
As LLMs continue to be increasingly adopted in health care contexts by professionals and patients alike [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>], studies like ours are important to find acceptable limits and set guardrails where necessary.</p><p>Contrary to expectations, our findings suggest that LLMs are quite robust to common textual perturbations. Furthermore, the impact of these perturbations is far from monotonic&#x2014;that is, performance does not consistently degrade as input quality worsens. In fact, we observed several instances where model performance improved following certain perturbations. Among the 3 types of variations we studied, redaction&#x2014;the removal or masking of information, often due to privacy concerns or cognitive lapses [<xref ref-type="bibr" rid="ref12">12</xref>]&#x2014;proved to be the most detrimental. This was followed by homophones (words that sound alike but have different meanings or spellings, such as &#x201C;heal&#x201D; vs &#x201C;heel&#x201D;) and typographical errors (misspellings or accidental keystrokes, such as &#x201C;diabtes&#x201D; instead of &#x201C;diabetes&#x201D;), which had comparatively milder effects. We hope that our perturbation dataset (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and the findings will stimulate more research at the intersection of robustness and health using LLMs. Ensuring that LLMs can handle the imperfect real-world data encountered in health care settings will enhance their reliability and effectiveness, ultimately supporting better patient outcomes.</p></sec><sec id="s1-2"><title>Related Work</title><p>LLMs have made significant strides in health care applications, such as clinical decision support, medical question answering, and patient education, which constitute 62% of health care use cases [<xref ref-type="bibr" rid="ref13">13</xref>]. 
The robustness of LLMs is viewed in terms of benchmarks of efficiency and reliability of models, or user experience metrics of applications [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Our work emphasizes the impact of user input variations in real-world health care applications.</p><p>Prior studies have investigated LLM robustness to word-level and typographical perturbations [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>] and have also highlighted their capabilities in language error correction [<xref ref-type="bibr" rid="ref18">18</xref>]. However, evaluating robustness at the intersection of LLMs and health care remains a critical step toward responsible and ethical innovation and adoption in medical settings. This study assesses LLM performance across a range of health-related tasks, including sentiment classification (eg, for patient diarization), medical note classification, and health care&#x2013;focused question answering [<xref ref-type="bibr" rid="ref8">8</xref>]. Notably, this study represents one of the earliest systematic efforts to measure and evaluate LLM robustness to human-like input variations within the health care domain.</p><p>Typographical errors frequently occur when patients input their symptoms or information, often under stress, sickness, or haste. Homophones are common when mediated by verbal communication (eg, text to speech for doctors&#x2019; notes), when written by individuals with limited understanding of medical terms or the English language, or when under cognitive impairments or fatigue [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Redaction is routinely used to protect sensitive patient information and can also happen under stress or duress [<xref ref-type="bibr" rid="ref12">12</xref>]. In fact, masking certain words is one of the most common approaches for evaluating LLM model performance. 
Here, we do not infer intentionality but rather use redaction as a short-hand notation to describe this phenomenon of removing some of the words [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. By focusing on these perturbations, our research aims to provide a targeted and practical assessment of LLM performance in health care settings, making our findings directly interpretable.</p></sec><sec id="s1-3"><title>Research Questions</title><p>This research aims to provide insights into the following research questions (RQs):</p><list list-type="bullet"><list-item><p>RQ1: How do perturbations in user input impact the performance of LLMs in health contexts?</p></list-item><list-item><p>RQ2: What are the relative impacts of different kinds of perturbations on LLMs in health contexts?</p></list-item><list-item><p>RQ3: What are the relative impacts of perturbations on health-related terms as opposed to other terms on LLM performance in health-related tasks?</p></list-item></list></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>We designed an experiment with 3 commonly occurring human variations over 3 medically relevant tasks for 3 different LLMs. We introduced various perturbation levels to input texts for each of the 3 tasks and examined the response from each of the LLMs [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. For baseline comparisons, we also measured the LLM responses to the original unperturbed texts.</p><p>We evaluated the robustness of LLMs across 3 different medical tasks. The first task is a classification task that required the LLM to identify the sentiment of a given sentence. 
The second task is also a classification task in which the LLM determined a disease condition from a medical abstract given [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. The sentiment task has 2 categories, and medical condition has 3 categories. This design is intended to evaluate the accuracy of the classification using both binary and ternary categories. The third task is a question-and-answer task in which the LLM responded to a question about a medical note provided [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>The experiment workflow is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The input data obtained from each of the datasets is perturbed and packaged within the prompt. This prompt is sent to the LLM via an application programming interface (API), and the responses are recorded. These responses are compared and evaluated based on the ground truth available in the original database. The settings for each of the LLMs are summarized in part C in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>A summary of the experimental design across dimensions (perturbation types, perturbation levels, tasks, and large language models [LLMs]) and the workflow leading from dataset generation, collection of data, pre-processing, and analysis. 
BERT: Bidirectional Encoder Representation from Transformers; BLEU: Bilingual Evaluation Understudy; ROUGE: Recall-Oriented Understudy for Gisting Evaluation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e83640_fig01.png"/></fig></sec><sec id="s2-2"><title>Perturbations</title><p>End users and patients often turn to general-purpose tools, such as search engines and publicly available chatbots&#x2014;even when more specialized health care models exist&#x2014;for health-related information-seeking and decision support [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. To reflect this real-world usage pattern, we selected both general-purpose and domain-specific LLMs for evaluation, with a focus on their utility in health-related tasks.</p><p>We perturbed the input text provided to the LLMs using 3 techniques intended to simulate typical human-based perturbations: first, we introduced typographical errors; second, we used homophones to replace words in the text (eg, disease vs decease); and third, we removed some words from the text [<xref ref-type="bibr" rid="ref21">21</xref>]. 
The examples of such perturbations are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Perturbation types used in the study with specific examples for each perturbation that is applied to the original sample.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Original</td><td align="left" valign="bottom">How did<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> voxelotor affect<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> the patient&#x2019;s scleral icterus<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> and overall quality of life<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> in<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> the given discharge<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> summary?</td></tr></thead><tbody><tr><td align="left" valign="top">Typographical</td><td align="left" valign="top">How did voxelotor affec<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> the patient&#x2019;s scleral icterus and overall quality of life in the given dicsharge<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> summary?</td></tr><tr><td align="left" valign="top">Homophone</td><td align="left" valign="top">How deed<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> voxelotor affect the patient&#x2019;s scleral icterus and overall quality of life inn<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> the given discharge summary?</td></tr><tr><td align="left" valign="top">Redaction</td><td align="left" valign="top">How did voxelotor affect the patient&#x2019;s scleral <underline/>and overall quality of <underline/>in the given discharge summary?</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Homophone.</p></fn><fn id="table1fn2"><p><sup>b</sup>Typographical 
error.</p></fn><fn id="table1fn3"><p><sup>c</sup>Redaction.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Perturbation Levels</title><p>Different levels of perturbations elicit different responses from LLMs. Each perturbation technique is applied as a percentage of the textual data to systematically investigate the models&#x2019; robustness. In this study, we use 3 levels of perturbation: low, medium, and high. The operationalization for each type of perturbation is described below.</p></sec><sec id="s2-4"><title>Typographical Errors</title><p>Perturbations are introduced as a percentage of the total number of characters. We test typographical errors at 10%, 30%, and 50%, which we refer to as low, medium, and high perturbation levels, respectively.</p></sec><sec id="s2-5"><title>Homophone Substitutions</title><p>These perturbations are applied as a percentage of the total number of words, specifically at 10%, 20%, and 30%, which we refer to as low, medium, and high, respectively. Given that not all words have homophones, these lower percentages are realistic for evaluating the effects of homophone confusion in medical queries. We consider the first level of perturbation (eg, 10%) in each category to be low and the next 2 levels to be medium and high, respectively. By systematically varying these perturbation levels, we aim to understand how LLMs handle common errors in medical contexts, thereby assessing their practical utility and reliability in real-world health care applications.</p></sec><sec id="s2-6"><title>Redactions</title><p>Perturbations are implemented as a percentage of the total number of words. We examine the impact of word redaction at the 10%, 30%, and 50% levels, simulating scenarios where critical information might be partially obscured or omitted in medical documentation. 
Similar to typographical errors, we refer to these levels as low, medium, and high.</p></sec><sec id="s2-7"><title>Dataset Creation</title><p>We selected 3 datasets as the tasks for this study: tweet emotions [<xref ref-type="bibr" rid="ref27">27</xref>], medical abstracts [<xref ref-type="bibr" rid="ref28">28</xref>], and question answering [<xref ref-type="bibr" rid="ref29">29</xref>]. Our perturbation process targets semantically valid words, which we define based on their part-of-speech tags. Specifically, we consider adjectives, adverbs, verbs, and nouns as valid words for perturbation. Since sentences vary in the number of valid words they contain, we adjusted the perturbation to ensure that a specific percentage of words in each sentence is perturbed while only targeting valid words. For example, in a sentence containing 10 words, of which 5 are valid, a 10% perturbation rate corresponds to perturbing 1 valid word, effectively 20% of the valid words in that sentence.</p></sec><sec id="s2-8"><title>Perturbation Techniques</title><p>After identifying the words to perturb, we applied one of the following transformations:</p><list list-type="bullet"><list-item><p>Typographical perturbations: insertion, deletion, substitution, or transposition of letters in the chosen word. We randomly perturbed 20%-50% of the letters in a word.</p></list-item><list-item><p>Homophone substitution: used the Datamuse API to retrieve potential homophones [<xref ref-type="bibr" rid="ref22">22</xref>]. For each valid word, the first available homophone replaced the original word</p></list-item><list-item><p>Redaction: removal of specified percentage of the selected valid words</p></list-item></list><p>After applying perturbations, we retained only examples that could generate all specified perturbation percentages (eg, 10%, 30%, 50%). 
These perturbed representations are combined into a new dataset called Health-LLM-Perturbation Dataset that we will be sharing with the community to support similar research by others.</p></sec><sec id="s2-9"><title>Training and Testing Splits</title><p>For generative language model experiments, we split the data as follows:</p><list list-type="bullet"><list-item><p>Prompt tuning: 20% of the dataset</p></list-item><list-item><p>Testing: randomly selected 1000 samples from the 80%</p></list-item><list-item><p>Validation: the remainder of the 80% after test set extraction</p></list-item></list><p>For Bidirectional Encoder Representation from Transformers (BERT)&#x2013;based language model experiments, we randomized the dataset, excluding the same 1000 test samples, and divided the remaining data into an 80% training set and a 20% validation set. In question-answer experiments, even if a medical note had multiple associated questions in the original dataset, we selected only one of those questions to include in our dataset. Training, validation, and test sets were split based on unique medical notes to ensure no overlap between the sets.</p></sec><sec id="s2-10"><title>Design of Prompt</title><p>The conversational prompt may take various forms depending on the medical task. We adopted a few-shot (n=5) prompting approach in this work. Few-shot prompting is a technique to guide LLMs to perform specific tasks by providing a few examples (or &#x201C;shots&#x201D;) within the prompt. The exact prompts for each of the tasks are given in part A of <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. The basic structure of a 5-shot prompt consisted of a role assignment, 5 examples, specific instructions, input data, and other instructions (if necessary). In the case of classification tasks, this basic structure was followed. For the question-and-answer tasks, we additionally instructed the LLM to provide an answer based only on the medical note provided. 
With an API call to the LLM, each data text was tested within its own session. However, when testing manually, the LLMs sometimes responded based on previous prompts, especially if the medical notes were similar. As a precautionary measure, to ensure separation across medical cases, specific instructions were included to prevent the LLM from being contaminated by short-term memory. To improve the accuracy of the response, we instructed the LLM to provide a short justification, and in case no justification could be provided, the LLM was instructed to respond accordingly. Asking the LLM to provide an explanation for the response helped the LLM provide a more accurate answer [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s2-11"><title>Metrics for Evaluation</title><p>For classification tasks, such as sentiment and medical condition, 4 metrics were used for evaluation: accuracy, recall, precision, and F1. For the question-answer task, distance measures such as BLEU (Bilingual Evaluation Understudy) [<xref ref-type="bibr" rid="ref30">30</xref>], ROUGE (Recall-Oriented Understudy for Gisting Evaluation) [<xref ref-type="bibr" rid="ref31">31</xref>], and BERTScore (Bidirectional Encoder Representation from Transformers) [<xref ref-type="bibr" rid="ref32">32</xref>] were used [<xref ref-type="bibr" rid="ref33">33</xref>]. Although the 3 measures were used, only one of the measures would be used for analysis because averaging the 3 different measures would not be meaningful and would introduce noise. The purpose of collecting data was to see if the patterns were different for the 3 measures. For researchers interested in any of the 3 measures, we make the results available in Tables S7 and S8 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>We evaluated the performance in terms of variation from the original average. 
For ease of interpretation, we split them into the following 4 categories [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]:</p><list list-type="bullet"><list-item><p>Increase: performance increases (rather than decreases) with perturbations</p></list-item><list-item><p>Stable: performance decreases less than 5% of original</p></list-item><list-item><p>Decrease: performance decreases more than 5% but less than 50%</p></list-item><list-item><p>Catastrophic drop: performance decreases more than 50% of original</p></list-item></list><p>We used the statistical package of Microsoft Excel to perform statistical tests. To examine the effect of perturbations (RQ1), we aggregated the results and determined the proportions of the levels of impact of perturbations. Then, we performed a Pearson chi-squared test to check if perturbation effects were independent across tasks and LLMs. To examine the relative impact of perturbations across dimensions (RQ2), we conducted repeated measures ANOVA across tasks, perturbations, perturbation levels, and LLMs, which is reported in Table S9 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. In order to strengthen our numerical analysis by assuming nonnormal data, we conducted a nonparametric Friedman chi-square test [<xref ref-type="bibr" rid="ref37">37</xref>] across groups for each dimension, followed by post hoc tests using pairwise Conover tests with the Bonferroni correction. To examine the effect of perturbations on medical terms (RQ3), we compiled the proportions of perturbed text that were medical terms and then evaluated the effects.</p></sec><sec id="s2-12"><title>Ethical Considerations</title><p>All study procedures complied with OpenAI&#x2019;s usage policies and ethical guidelines. No personal or sensitive information was used or generated during the research, and all data were securely stored with access restricted to authorized research personnel. 
No human subject was involved, and all data used were based on publicly available sources. Therefore, an ethics approval was not sought. The overarching goal of this study is to contribute to the development of health LLMs that are safe and equitable across input variability. By systematically evaluating the impact of different levels of input variability on health LLMs, this exploratory work aims to support more inclusive and socially beneficial AI systems.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>RQ1: Robustness to Perturbations</title><p>The performance observed for various perturbations for each task and at each level of perturbation is shown in Tables S1-S8 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. In <xref ref-type="table" rid="table2">Table 2</xref>, we summarize the findings by groups as defined in the <italic>Methods</italic> section. For the question-answer task, all 3 measures of BLEU, ROUGE, and BERTScore had similar patterns. 
For purposes of analysis, we used ROUGE because these values were approximately between the other 2 measures and would greatly simplify our analysis without unnecessary complexity.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>The effect of perturbations across different tasks for each large language model (LLM) is shown as 4 levels of robustness ranging from minimal (increase, stable) to decrease and catastrophic<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">LLMs per task</td><td align="left" valign="bottom">Increase</td><td align="left" valign="bottom">Stable</td><td align="left" valign="bottom">Decrease</td><td align="left" valign="bottom">Catastrophic</td><td align="left" valign="bottom">Total</td></tr></thead><tbody><tr><td align="left" valign="top">Total, n (%)</td><td align="char" char="." valign="top">38 (14.07)</td><td align="char" char="." valign="top">113 (41.85)</td><td align="char" char="." valign="top">104 (38.52)</td><td align="char" char="." valign="top">15 (5.56)</td><td align="char" char="." 
valign="top">270 (100)</td></tr><tr><td align="left" valign="top" colspan="6">Sentiment, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">6</td><td align="left" valign="top">2</td><td align="left" valign="top">19</td><td align="left" valign="top">9</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BlueBERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0</td><td align="left" valign="top">16</td><td align="left" valign="top">20</td><td align="left" valign="top">0</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="left" valign="top">0</td><td align="left" valign="top">19</td><td align="left" valign="top">14</td><td align="left" valign="top">3</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top" colspan="6">Medical abstract, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT</td><td align="left" valign="top">13</td><td align="left" valign="top">15</td><td align="left" valign="top">5</td><td align="left" valign="top">3</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BlueBERT</td><td align="left" valign="top">0</td><td align="left" valign="top">28</td><td align="left" valign="top">8</td><td align="left" valign="top">0</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="left" valign="top">0</td><td align="left" valign="top">15</td><td align="left" valign="top">21</td><td align="left" valign="top">0</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top" colspan="6">Question answer, n</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT</td><td align="left" valign="top">15</td><td align="left" valign="top">1</td><td align="left" valign="top">11</td><td align="left" valign="top">0</td><td align="left" valign="top">27</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="left" valign="top">4</td><td align="left" valign="top">17</td><td align="left" valign="top">6</td><td align="left" valign="top">0</td><td align="left" valign="top">27</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>These robustness bins allow the results to distinguish catastrophic effects from other robustness bins. This interpretation provides a practical approach to evaluating robustness according to existing literature. LLMs are robust to perturbations because only about 6% (15/270) of the perturbed cases resulted in catastrophic failure.</p></fn><fn id="table2fn2"><p><sup>b</sup>GPT: generative pretrained transformer.</p></fn><fn id="table2fn3"><p><sup>c</sup>BERT: Bidirectional Encoder Representation from Transformers.</p></fn></table-wrap-foot></table-wrap><p>Our first observation was that the impact of perturbations was far from uniform or monotonic. 
While the performance on average decreased with perturbation, some notable findings are as follows:</p><list list-type="bullet"><list-item><p>In 38 (14.07%; nearly 1 in 7) out of 270 scenarios, the performance of LLMs improved rather than decreased with perturbations.</p></list-item><list-item><p>In more than 40% (113/270, 41.85%) of the scenarios, the performance of LLMs remained stable. Thus, in more than half (151/270, 55.92%) of the cases, the performance was stable or improved.</p></list-item><list-item><p>In 15 (5.56%) out of the 270 scenarios, the performance of LLMs dropped to catastrophically poor levels. Most (12/15) of these catastrophic drops occurred when using generative pretrained transformer rather than BlueBERT or Llama.</p></list-item></list><p>The chi-squared tests across LLMs (<italic>&#x03C7;</italic><sup>2</sup><sub>6</sub>=81.25, <italic>P</italic>&#x003C;.001) and tasks (<italic>&#x03C7;</italic><sup>2</sup><sub>6</sub>=44.74, <italic>P</italic>&#x003C;.001) were both significant.</p><p>Next, we compared the impact of perturbations across the various dimensions by examining the variance of means. We performed a repeated measures ANOVA across the various dimensions, all of which were significant (part D of <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, <xref ref-type="table" rid="table1">Table 1</xref>), and then followed up with post hoc <italic>t</italic> tests. 
In order to strengthen the numerical analysis in nonnormal conditions, we performed separate Friedman [<xref ref-type="bibr" rid="ref37">37</xref>] chi-squared (<italic>Q</italic>) tests across the groups for perturbation types (Friedman <italic>&#x03C7;</italic><sup>2</sup><sub>2</sub>=8.66, <italic>P</italic>=.01), for LLMs (Friedman <italic>&#x03C7;</italic><sup>2</sup><sub>2</sub>=18.428, <italic>P</italic>&#x003C;.001), for task types (Friedman <italic>&#x03C7;</italic><sup>2</sup><sub>2</sub>=7, <italic>P</italic>=.03), and for perturbation levels (Friedman <italic>&#x03C7;</italic><sup>2</sup><sub>2</sub>=26.547, <italic>P</italic>&#x003C;.001). Since all 4 tests were significant, we performed post hoc pairwise Conover tests with the Bonferroni correction. The results of the Friedman test and post hoc tests are given in <xref ref-type="table" rid="table3">Table 3</xref>. Two of the pairwise comparisons show no effect, and the rest show either medium or large effects.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Robustness to perturbations across repeated measures of each dimension (perturbation types, perturbation levels, tasks, and large language models [LLMs]) is examined using nonparametric Friedman chi-square tests over groups showing significant variation across groups in each dimension<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Group 1</td><td align="left" valign="bottom">Group 2</td><td align="left" valign="bottom" colspan="3">Pairwise Conover tests using Bonferroni correction</td><td align="left" valign="bottom">Friedman <italic>&#x03C7;</italic><sup>2</sup> test (<italic>df</italic>)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom"><italic>t</italic> value</td><td 
align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">Effect size (<italic>r</italic>)</td><td align="left" valign="bottom"/><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">Perturbation types</td><td align="left" valign="top">8.66 (2)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Typographical</td><td align="left" valign="top">Homophone</td><td align="left" valign="top">0.006</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">0.0009</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Homophone</td><td align="left" valign="top">Redaction</td><td align="left" valign="top">0.88</td><td align="left" valign="top">.02</td><td align="left" valign="top">0.128</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Redaction</td><td align="left" valign="top">Typographical</td><td align="left" valign="top">0.877</td><td align="left" valign="top">.03</td><td align="left" valign="top">0.127</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5">Perturbation levels</td><td align="left" valign="top">26.547 (2)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Original</td><td align="left" valign="top">Low</td><td align="left" valign="top">0.84</td><td align="left" valign="top">.002</td><td align="left" valign="top">0.099</td><td align="left" valign="top"/><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Low</td><td align="left" valign="top">Medium</td><td align="left" valign="top">1.45</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.17</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medium</td><td align="left" valign="top">Original</td><td align="left" valign="top">2.29</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.27</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="5">Tasks</td><td align="left" valign="top">7 (2)</td><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sentiment</td><td align="left" valign="top">Medical abstract</td><td align="left" valign="top">1.3</td><td align="left" valign="top">.46</td><td align="left" valign="top">0.2</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medical abstract</td><td align="left" valign="top">Question answer</td><td align="left" valign="top">3.1</td><td align="left" valign="top">.02</td><td align="left" valign="top">0.51</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Question answer</td><td align="left" valign="top">Sentiment</td><td align="left" valign="top">2.003</td><td align="left" valign="top">.46</td><td align="left" valign="top">0.309</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td 
align="left" valign="top" colspan="5">LLMs</td><td align="left" valign="top">18.428 (2)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">BlueBERT<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">4.32</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.67</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BlueBERT</td><td align="left" valign="top">Llama</td><td align="left" valign="top">1.22</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">0.19</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="left" valign="top">GPT</td><td align="left" valign="top">3.11</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">0.48</td><td align="left" valign="top"/><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Pairwise post hoc tests across groups using Conover tests with the Bonferroni correction showing magnitude, significance, and effect size for the ranked tests (<italic>r</italic>). The results show significant differences between pairs in each dimension with medium-to-large effect size. 
Specifically, (1) robustness to typographical errors and homophones is similar, (2) perturbation levels have significant effects, (3) robustness between medical abstract and question and answer is significantly different with large effect, and (4) robustness across large language models (generative pretrained transformer vs Llama and generative pretrained transformer vs BlueBERT) is significantly different with a large effect.</p></fn><fn id="table3fn2"><p><sup>b</sup>GPT: generative pretrained transformer.</p></fn><fn id="table3fn3"><p><sup>c</sup>BERT: Bidirectional Encoder Representation from Transformers.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>RQ2: Relative Impact of Perturbations Across Dimensions</title><p>Here, we study how the impact of perturbation varies across types of perturbation. As shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, all the tasks are most prominently affected by redaction. Although the question-answer task is affected more by homophone perturbations than by typographical errors, both sentiment and medical abstract tasks are affected more by typographical errors than by homophones.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>A color-coded bar chart showing the effect of 3 levels of perturbation (low, medium, and high levels) as a percentage of change for each task (blue color for medical abstract, red color for question answer, and yellow for sentiment) compared to the original unperturbed data across different types of perturbation (homophone, redaction, and typographical).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e83640_fig02.png"/></fig><p><xref ref-type="table" rid="table4">Table 4</xref> shows that typographical errors and homophone-based perturbations have relatively limited impact, whereas redaction-based perturbations lead to a much more pronounced degradation in performance, especially in 
terms of the increased number of catastrophic responses.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>The effect of perturbation types is shown as 4 levels of robustness ranging from minimal (increase, stable) to decrease and catastrophic<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Perturbation types</td><td align="left" valign="bottom">Increase</td><td align="left" valign="bottom">Stable</td><td align="left" valign="bottom">Decrease</td><td align="left" valign="bottom">Catastrophic</td><td align="left" valign="bottom">Total</td></tr></thead><tbody><tr><td align="left" valign="top">Total, n (%)</td><td align="left" valign="top">5 (6.9)</td><td align="left" valign="top">35 (48.6)</td><td align="left" valign="top">28 (38.9)</td><td align="left" valign="top">4 (5.6)</td><td align="left" valign="top">72 (100)</td></tr><tr><td align="left" valign="top">Typographical, n</td><td align="left" valign="top">4</td><td align="left" valign="top">11</td><td align="left" valign="top">9</td><td align="left" valign="top">0</td><td align="left" valign="top">24</td></tr><tr><td align="left" valign="top">Homophone, n</td><td align="left" valign="top">1</td><td align="left" valign="top">16</td><td align="left" valign="top">7</td><td align="left" valign="top">0</td><td align="left" valign="top">24</td></tr><tr><td align="left" valign="top">Redaction, n</td><td align="left" valign="top">0</td><td align="left" valign="top">8</td><td align="left" valign="top">12</td><td align="left" valign="top">4</td><td align="left" valign="top">24</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>These robustness bins allow the results to distinguish catastrophic effects from other robustness bins. This interpretation provides a practical approach to evaluating robustness according to existing literature. 
The results show that large language models are, in general, robust to low-to-medium level of perturbations for typographical errors and homophones. However, medium levels of redactions can trigger large language models to catastrophic results.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>RQ3: Differences in the Impact of Medical Versus Nonmedical Terms</title><p>Next, we consider the relative impact of the perturbations on a medical term versus a nonmedical term in the user prompt. The trends varied across settings and were not obvious in aggregate terms. To explore the nuanced effect of perturbation on medical terms, we examined the performances across different dimensions.</p><p><xref ref-type="fig" rid="figure3">Figure 3</xref> shows some variation in median ROUGE scores when measured by the percentage of perturbed medical terms in the LLM prompt. For low perturbations (10% perturbation level), there is visible variation. However, for medium and high levels of perturbations, the variation is not apparent. In general, the performance shows a decreasing trend with a higher ratio of medical words being perturbed.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Variation in median ROUGE (Recall-Oriented Understudy for Gisting Evaluation) values observed across buckets of percentage of medical words perturbed for different perturbation levels (blue color for low perturbation level, red for medium perturbation level, and yellow for high perturbation level) showing higher variability for low perturbation levels and a decreasing trend with higher ratio of medical words being perturbed.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e83640_fig03.png"/></fig><p>We also conducted preliminary error analysis to understand the patterns in the data and interpret some of the macro observations identified above. 
This analysis was undertaken with the manual inspection of samples used in the classification of medical conditions.</p><p>At a macro level, consistent with the findings in <xref ref-type="table" rid="table2">Table 2</xref>, we found that minor perturbations often did not impact the model&#x2019;s performance. In fact, in some cases, performance improved slightly at lower perturbation levels. This robustness was particularly evident when perturbations occurred in words that were less critical to the classification task, such as prepositions (&#x201C;in&#x201D; or &#x201C;off&#x201D;) that did not alter the medical context or disease description. These changes typically had negligible effects on the output, and the performance remained stable.</p><p>However, when perturbations targeted key medical terms critical for classification, the model&#x2019;s performance declined, as expected. Interestingly, instances arose where both the LLM and a human nonexpert could become confused between 2 similar medical conditions, especially when there was overlapping information in the description. In such cases, if the perturbation affected the distractor (ie, the secondary, less relevant condition), the performance improved as the model&#x2019;s focus shifted back to the correct classification. Conversely, when the perturbation impacted the primary condition, the degradation in performance was not as severe at low perturbation levels, as the LLM was already likely to be uncertain due to the inherent ambiguity of the input.</p><p>The following medical abstract is classified as a &#x201C;Digestive System Disease&#x201D; as per the ground truth in the dataset:</p><list list-type="bullet"><list-item><p>Classification: 2</p></list-item><list-item><p>Medical abstract: Carcinoma of the gallbladder. Gallbladder cancer remains difficult to diagnose preoperatively. However, recent work suggests that ultrasound may be effective. 
Gallbladder cancer remains highly lethal despite aggressive therapy. Extension of the disease beyond the mucosa predicts a poor chance of long-term survival.</p></list-item></list><p>Even without any perturbation, the LLM incorrectly classified this as &#x201C;Neoplasm&#x201D; and failed to provide adequate justification for the misclassification. Notably, when key medical terms such as &#x201C;carcinoma&#x201D; or &#x201C;cancer&#x201D; were perturbed, the LLM correctly classified the abstract as a &#x201C;Digestive System Disease.&#x201D; This observation supports the hypothesis that low-level perturbations can impact distractors and potentially enhance LLM performance in cases where multiple potential medical conditions exist.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our study systematically evaluated the robustness of LLMs to different types of perturbations in health care contexts. The results reveal several insights into how LLMs perform under various conditions, which are discussed in relation to our RQs.</p><sec id="s4-1-1"><title>RQ1: Robustness to Perturbations</title><p>Perturbations affect LLM performance in nonuniform, nonmonotonic ways. While performance generally declines with increased perturbations, 1 in 7 cases showed improvement, and over half remained stable or improved, indicating resilience to certain input variations. However, occasional severe drops, especially with ChatGPT, underscore the need for stronger robustness measures. These findings highlight the importance of testing LLMs under realistic conditions for reliable health care use.</p></sec><sec id="s4-1-2"><title>RQ2: Relative Impact of Different Perturbations</title><p>The effect of perturbations varies by task and type. Redaction had the most detrimental impact, as it disrupts context more than typos or homophones. 
This is a significant concern in health care, where patients may redact data for privacy or due to cognitive lapses. Such disruptions can reduce accuracy in clinical and educational applications, emphasizing the need for LLMs resilient to missing information. Homophones most affected question-answer tasks, while typographical errors impacted sentiment and medical condition tasks more. This indicates the need for task-specific training to enhance robustness.</p></sec><sec id="s4-1-3"><title>RQ3: Medical vs Nonmedical Terms</title><p>At low perturbation levels, LLMs were more sensitive to medical term disruptions, underscoring the importance of precise medical language processing. At higher perturbation levels, performance degraded uniformly across term types, suggesting that excessive noise compromises overall model reliability. Ensuring robustness to both medical and general vocabulary is therefore essential for health care applications.</p></sec></sec><sec id="s4-2"><title>Implications for Health Informatics</title><p>The implications of our findings for health informatics are substantial. Ensuring the robustness of LLMs to real-world variations in input can significantly enhance their effectiveness and reliability in health care applications. This robustness is crucial for clinical decision support, patient education, and medical question answering, where accurate and dependable responses are paramount. While the general trend of robustness is encouraging, the potential for catastrophic drops is alarming. The variegated impact of different types of perturbations on performance in different settings can also inform the design of future health LLM systems on key aspects to prioritize building robust health LLMs. An incidental implication of our findings from error analysis of medical terms is the importance of medical experts in medical diagnosis and the use of LLMs as auxiliary support tools in health care. 
Further, the results suggest that designing ensemble methods that can combine the responses of multiple similar queries (some of which are perturbed versions of the original query) can be a useful pathway to increase LLM response accuracy. Further, our contribution to sharing a novel health-centric dataset with different types of human errors and levels of perturbation provides a valuable resource for further research. By making this dataset available, we aim to stimulate more research at the intersection of robustness and health LLMs, ultimately contributing to better patient outcomes and more efficient health care systems.</p><p>In alignment with the use cases of LLMs supporting clinical decision-making [<xref ref-type="bibr" rid="ref10">10</xref>], our study has given empirical evidence of robustness levels in response to perturbation types and perturbation levels for LLMs across a variety of medical tasks. Sentiment analysis is an important component of clinical decision-making for both clinicians [<xref ref-type="bibr" rid="ref38">38</xref>] and patients [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. Autonomous systems that involve triaging patients and screening referrals similarly make use of question-answer tasks and medical condition summaries. This study also adds to the literature on the axis of human&#x2013;artificial intelligence synergy and bias and data issues, which are 2 of the key dimensions of health care AI literature [<xref ref-type="bibr" rid="ref41">41</xref>]. Beyond accuracy, error-tolerant prompts, and redaction-aware design, safe and reliable health care LLMs are contingent upon contextual cues, safety guardrails, and regulatory checks in routine care.</p></sec><sec id="s4-3"><title>Limitations</title><p>While our study provides valuable insights into LLM robustness in health care contexts, it has several limitations that we plan to address in future work. 
First, the perturbations used&#x2014;typographical errors, homophones, and redactions&#x2014;were synthetically generated to simulate common real-world variations. Although representative and a useful starting point, these do not capture the full range of linguistic and contextual variability seen in actual clinical and patient-generated text. In future work, we will expand the scope of perturbations to include additional forms, such as abbreviations, syntactic reordering, and multilingual input. We also aim to incorporate real-world user data to better reflect the diversity of health care communication. Additionally, our current evaluation focused on 3 LLMs; extending this analysis to a broader set of models will help generalize our findings across architectures and domains.</p><p>We are also actively exploring practical mitigation strategies for real-world deployment, including interactive input filters and automated detection of high-risk queries. These efforts aim to enhance the safety and reliability of LLM-powered health care tools. Importantly, we are releasing our novel perturbation dataset to the research community, providing a valuable resource for benchmarking and advancing robustness in health-related natural language processing applications.</p><p>Despite the limitations, this study represents a significant step forward in understanding how LLMs perform under realistic input conditions. By identifying key vulnerabilities and sharing tools to address them, we aim to catalyze the development of more resilient, equitable, and trustworthy AI systems in health care.</p></sec><sec id="s4-4"><title>Conclusion</title><p>This study provides a comprehensive evaluation of the robustness of LLMs to multiple perturbations in health care contexts, specifically typographical errors, homophones, and redactions, revealing differing levels of resilience across health-related tasks. 
While LLMs exhibit adaptability to some input variations, redaction-based perturbations significantly impair their contextual understanding. These findings emphasize the necessity of robust evaluation frameworks that mirror real-world input variations to ensure the reliability of LLMs in applications, such as clinical decision support, patient education, and medical question answering. Our contribution to a health care&#x2013;specific dataset with diverse perturbations aims to advance research by fostering the development of more resilient LLMs. Future research should explore additional perturbation types, include a broader range of LLMs, and incorporate diverse user interactions to better simulate real-world scenarios, ultimately driving the creation of dependable and impactful health informatics systems.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This work was funded in part by the Rutgers University School of Communication and Information.</p></sec><sec><title>Data Availability</title><p>The input perturbation dataset is available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Summarized data are available in part B of <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec></notes><fn-group><fn fn-type="con"><p>SJ, MW, and VKS were involved in conceptualization, formal analysis, and validation. SJ, MM, SM, and MW were involved in the investigation and software. Data curation was performed by MW. SJ and VKS were involved in the analysis and methods. 
SJ, MW, and VKS wrote the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>Bidirectional Encoder Representation from Transformers</p></def></def-item><def-item><term id="abb3">BLEU</term><def><p>Bilingual Evaluation Understudy</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item><def-item><term id="abb6">RQ</term><def><p>research question</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hargittai</surname><given-names>E</given-names> </name></person-group><article-title>Hurdles to information seeking: spelling and typographical mistakes during users&#x2019; online behavior</article-title><source>J Assoc Inf Syst</source><year>2006</year><volume>7</volume><issue>1</issue><fpage>52</fpage><lpage>67</lpage><pub-id pub-id-type="doi">10.17705/1jais.00076</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hasan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Heger</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mansour</surname><given-names>S</given-names> </name></person-group><article-title>Spelling correction of user search queries through statistical machine translation</article-title><conf-name>Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Sep 17-21, 
2015</conf-date><pub-id pub-id-type="doi">10.18653/v1/D15-1051</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Crowell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Ngo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lacroix</surname><given-names>EM</given-names> </name></person-group><article-title>A frequency-based technique to improve the spelling suggestion rank in medical queries</article-title><source>J Am Med Inform Assoc</source><year>2004</year><volume>11</volume><issue>3</issue><fpage>179</fpage><lpage>185</lpage><pub-id pub-id-type="doi">10.1197/jamia.M1474</pub-id><pub-id pub-id-type="medline">14764616</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Klotzman</surname><given-names>V</given-names> </name></person-group><article-title>The difficulties of clinical NLP</article-title><source>Engineering Mathematics and Artificial Intelligence Foundations, Methods, and Applications</source><year>2023</year><publisher-name>CRC Press</publisher-name><fpage>413</fpage><lpage>424</lpage><pub-id pub-id-type="doi">10.1201/9781003283980-17</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shahsavar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>A</given-names> </name></person-group><article-title>User intentions to use ChatGPT for self-diagnosis and health-related purposes: cross-sectional survey study</article-title><source>JMIR Hum 
Factors</source><year>2023</year><month>05</month><day>17</day><volume>10</volume><fpage>e47564</fpage><pub-id pub-id-type="doi">10.2196/47564</pub-id><pub-id pub-id-type="medline">37195756</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cox</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cox</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Cox</surname><given-names>AD</given-names> </name></person-group><article-title>To err is human? How typographical and orthographical errors affect perceptions of online reviewers</article-title><source>Comput Human Behav</source><year>2017</year><month>10</month><volume>75</volume><fpage>245</fpage><lpage>253</lpage><pub-id pub-id-type="doi">10.1016/j.chb.2017.05.008</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Blagec</surname><given-names>K</given-names> </name><name name-style="western"><surname>Dorffner</surname><given-names>G</given-names> </name><name name-style="western"><surname>Moradi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>S</given-names> </name><name name-style="western"><surname>Samwald</surname><given-names>M</given-names> </name></person-group><article-title>A global analysis of metrics used for measuring performance in natural language processing</article-title><conf-name>Proceedings of NLP Power! 
The First Workshop on Efficient Benchmarking in NLP</conf-name><conf-date>May 27-28, 2022</conf-date><pub-id pub-id-type="doi">10.18653/v1/2022.nlppower-1.6</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kan&#x00E9;</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kocyigit</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ajanoh</surname><given-names>P</given-names> </name><name name-style="western"><surname>Abdalla</surname><given-names>A</given-names> </name><name name-style="western"><surname>Coulibali</surname><given-names>M</given-names> </name></person-group><article-title>Towards neural language evaluators</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 20, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1909.09268</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT&#x2011;4 as an AI chatbot for medicine</article-title><source>N Engl J Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Magrabi</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Ammenwerth</surname><given-names>E</given-names> </name><name name-style="western"><surname>McNair</surname><given-names>JB</given-names> </name><etal/></person-group><article-title>Artificial intelligence in clinical decision support: challenges for evaluating AI and practical implications</article-title><source>Yearb Med Inform</source><year>2019</year><month>08</month><volume>28</volume><issue>1</issue><fpage>128</fpage><lpage>134</lpage><pub-id pub-id-type="doi">10.1055/s-0039-1677903</pub-id><pub-id pub-id-type="medline">31022752</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tangsrivimol</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Darzidehkalani</surname><given-names>E</given-names> </name><name name-style="western"><surname>Virk</surname><given-names>HUH</given-names> </name><etal/></person-group><article-title>Benefits, limits, and risks of ChatGPT in medicine</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1518049</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1518049</pub-id><pub-id pub-id-type="medline">39949509</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bullement</surname><given-names>A</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>M</given-names> </name><name name-style="western"><surname>McMordie</surname><given-names>ST</given-names> </name><name name-style="western"><surname>Waters</surname><given-names>E</given-names> </name><name name-style="western"><surname>Hatswell</surname><given-names>AJ</given-names> </name></person-group><article-title>NICE, in confidence: an assessment of redaction to obscure confidential information in single 
technology appraisals by the National Institute for Health and Care Excellence</article-title><source>Pharmacoeconomics</source><year>2019</year><month>11</month><volume>37</volume><issue>11</issue><fpage>1383</fpage><lpage>1390</lpage><pub-id pub-id-type="doi">10.1007/s40273-019-00818-0</pub-id><pub-id pub-id-type="medline">31250397</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tam</surname><given-names>TYC</given-names> </name><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A framework for human evaluation of large language models in healthcare derived from literature review</article-title><source>NPJ Digit Med</source><year>2024</year><month>09</month><day>28</day><volume>7</volume><issue>1</issue><fpage>258</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01258-7</pub-id><pub-id pub-id-type="medline">39333376</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Nucci</surname><given-names>A</given-names> </name></person-group><article-title>LLM evaluation: key metrics and frameworks</article-title><source>Aisera</source><year>2026</year><access-date>2026-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aisera.com/blog/llm-evaluation/">https://aisera.com/blog/llm-evaluation/</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dhole</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Gangal</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Gehrmann</surname><given-names>S</given-names> </name><etal/></person-group><article-title>NL-augmenter: a framework for task-sensitive natural language augmentation</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 6, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2112.02721</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bommasani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Holistic evaluation of language models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 16, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2211.09110</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Are large language models really robust to word-level perturbations?</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.11166</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Vatsal</surname><given-names>S</given-names> </name></person-group><article-title>Robustness of large language models to perturbations in text</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 12, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.08989</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Heo</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Song</surname><given-names>M</given-names> </name></person-group><article-title>MLM-based typographical error correction of unstructured medical texts for named entity recognition</article-title><source>BMC Bioinformatics</source><year>2022</year><month>11</month><day>16</day><volume>23</volume><issue>1</issue><fpage>486</fpage><pub-id pub-id-type="doi">10.1186/s12859-022-05035-9</pub-id><pub-id pub-id-type="medline">36384464</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tao</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name></person-group><article-title>Speech sense disambiguation: tackling homophone ambiguity in end-to-end speech translation</article-title><conf-name>Proceedings 
of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1)</conf-name><conf-date>Aug 11-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.435</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Y</given-names> </name></person-group><article-title>RUPBench: benchmarking reasoning under perturbations for robustness evaluation in large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 16, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.11020</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Datamuse API</article-title><source>Datamuse</source><year>2025</year><access-date>2026-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.datamuse.com/api/">https://www.datamuse.com/api/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gade</surname><given-names>K</given-names> </name><name name-style="western"><surname>Geyik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kenthapadi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mithal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Taly</surname><given-names>A</given-names> </name></person-group><article-title>Explainable AI in industry: practical challenges and lessons learned</article-title><conf-name>WWW &#x2019;20</conf-name><conf-date>Apr 20-24, 2020</conf-date><conf-loc>Taipei, Taiwan</conf-loc><fpage>303</fpage><lpage>304</lpage><pub-id 
pub-id-type="doi">10.1145/3366424.3383110</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fabbri</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Kry&#x015B;ci&#x0144;ski</surname><given-names>W</given-names> </name><name name-style="western"><surname>McCann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>C</given-names> </name><name name-style="western"><surname>Socher</surname><given-names>R</given-names> </name><name name-style="western"><surname>Radev</surname><given-names>D</given-names> </name></person-group><article-title>SummEval: re-evaluating summarization evaluation</article-title><source>Trans Assoc Comput Linguist</source><year>2021</year><month>04</month><day>26</day><volume>9</volume><fpage>391</fpage><lpage>409</lpage><pub-id pub-id-type="doi">10.1162/tacl_a_00373</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bhasuran</surname><given-names>B</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Quality of answers of generative large language models versus peer users for interpreting laboratory test results for lay patients: evaluation study</article-title><source>J Med Internet Res</source><year>2024</year><month>04</month><day>17</day><volume>26</volume><fpage>e56655</fpage><pub-id pub-id-type="doi">10.2196/56655</pub-id><pub-id pub-id-type="medline">38630520</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ni</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Applications and concerns of ChatGPT and other conversational large language models in health care: systematic review</article-title><source>J Med Internet Res</source><year>2024</year><month>11</month><day>7</day><volume>26</volume><fpage>e22769</fpage><pub-id pub-id-type="doi">10.2196/22769</pub-id><pub-id pub-id-type="medline">39509695</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Gupta</surname><given-names>P</given-names> </name></person-group><article-title>Emotion detection from text</article-title><source>Kaggle</source><year>2025</year><access-date>2026-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text">https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>Sebischair/medical&#x2011;abstracts&#x2011;TC&#x2011;corpus</article-title><source>GitHub</source><year>2025</year><access-date>2026-01-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/sebischair/Medical-Abstracts-TC-Corpus">https://github.com/sebischair/Medical-Abstracts-TC-Corpus</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>Starmpcc/asclepius</article-title><source>GitHub</source><year>2023</year><access-date>2026-01-21</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://github.com/starmpcc/Asclepius">https://github.com/starmpcc/Asclepius</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Papineni</surname><given-names>K</given-names> </name><name name-style="western"><surname>Roukos</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ward</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>WJ</given-names> </name></person-group><article-title>BLEU: a method for automatic evaluation of machine translation</article-title><conf-name>Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 7-12, 2002</conf-date><pub-id pub-id-type="doi">10.3115/1073083.1073135</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>CY</given-names> </name></person-group><article-title>ROUGE: a package for automatic evaluation of summaries</article-title><access-date>2026-01-21</access-date><conf-name>Proceedings of the ACL 2004 Workshop: Text Summarization Branches Out</conf-name><conf-date>Jul 21-22, 2004</conf-date><conf-loc>Barcelona, Spain</conf-loc><fpage>74</fpage><lpage>81</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W04-1013/">https://aclanthology.org/W04-1013/</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>BERTScore: evaluating text generation with BERT</article-title><source>OpenReview</source><year>2020</year><access-date>2026-02-05</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://openreview.net/attachment?id=SkeHuCVFDr&#x0026;name=original_pdf">https://openreview.net/attachment?id=SkeHuCVFDr&#x0026;name=original_pdf</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Schopf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Braun</surname><given-names>D</given-names> </name><name name-style="western"><surname>Matthes</surname><given-names>F</given-names> </name></person-group><article-title>Evaluating unsupervised text classification: zero-shot and similarity-based approaches</article-title><conf-name>Proceedings of the 2022 6th International Conference on Natural Language Processing and Information Retrieval (NLPIR 2022)</conf-name><conf-date>Dec 16-18, 2022</conf-date><pub-id pub-id-type="doi">10.1145/3582768.3582795</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Braiek</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Khomh</surname><given-names>F</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Lorenzi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zuluaga</surname><given-names>MA</given-names> </name></person-group><article-title>Machine learning robustness: a primer</article-title><source>Trustworthy AI in Medical Imaging</source><year>2025</year><publisher-name>Academic Press</publisher-name><fpage>37</fpage><lpage>71</lpage><pub-id pub-id-type="doi">10.1016/B978-0-44-323761-4.00012-2</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>D&#x00FC;nkel</surname><given-names>O</given-names> </name><name name-style="western"><surname>Jesslen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>J</given-names> </name><etal/></person-group><article-title>CNS-bench: benchmarking image classifier robustness under continuous nuisance shifts</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 23, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2507.17651</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name></person-group><article-title>Understanding catastrophic overfitting in single-step adversarial training</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 5, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2010.01799</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sheldon</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Fillyaw</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>WD</given-names> </name></person-group><article-title>The use and interpretation of the Friedman test in the analysis of ordinal-scale data in repeated measures designs</article-title><source>Physiother Res Int</source><year>1996</year><volume>1</volume><issue>4</issue><fpage>221</fpage><lpage>228</lpage><pub-id pub-id-type="doi">10.1002/pri.66</pub-id><pub-id pub-id-type="medline">9238739</pub-id></nlm-citation></ref><ref 
id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hah</surname><given-names>H</given-names> </name><name name-style="western"><surname>Goldin</surname><given-names>DS</given-names> </name></person-group><article-title>How clinicians perceive artificial intelligence&#x2011;assisted technologies in diagnostic decision making: mixed methods approach</article-title><source>J Med Internet Res</source><year>2021</year><month>12</month><day>16</day><volume>23</volume><issue>12</issue><fpage>e33540</fpage><pub-id pub-id-type="doi">10.2196/33540</pub-id><pub-id pub-id-type="medline">34924356</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shankar</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yip</surname><given-names>A</given-names> </name></person-group><article-title>Transforming patient feedback into actionable insights through natural language processing: knowledge discovery and action research study</article-title><source>JMIR Form Res</source><year>2025</year><month>08</month><day>26</day><volume>9</volume><fpage>e69699</fpage><pub-id pub-id-type="doi">10.2196/69699</pub-id><pub-id pub-id-type="medline">40857725</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shankar</surname><given-names>R</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Bundele</surname><given-names>A</given-names> </name></person-group><article-title>Patient voices in dialysis care: sentiment analysis and topic modeling study of social media discourse</article-title><source>J Med Internet 
Res</source><year>2025</year><month>05</month><day>15</day><volume>27</volume><fpage>e70128</fpage><pub-id pub-id-type="doi">10.2196/70128</pub-id><pub-id pub-id-type="medline">40372782</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ogut</surname><given-names>E</given-names> </name></person-group><article-title>Artificial intelligence in clinical medicine: challenges across diagnostic imaging, clinical decision support, surgery, pathology, and drug discovery</article-title><source>Clin Pract</source><year>2025</year><month>09</month><day>16</day><volume>15</volume><issue>9</issue><fpage>169</fpage><pub-id pub-id-type="doi">10.3390/clinpract15090169</pub-id><pub-id pub-id-type="medline">41002784</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Input perturbation dataset.</p><media xlink:href="ai_v5i1e83640_app1.zip" xlink:title="ZIP File, 11106 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Prompts, results, settings, and ANOVA results with post hoc tests.</p><media xlink:href="ai_v5i1e83640_app2.pdf" xlink:title="PDF File, 320 KB"/></supplementary-material></app-group></back></article>