<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e82608</article-id><article-id pub-id-type="doi">10.2196/82608</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Model Adaptation Strategies in Speech-Based Cognitive Screening: Systematic Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Taherinezhad</surname><given-names>Fatemeh</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Momeni Nezhad</surname><given-names>Mohamad Javad</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Karimi</surname><given-names>Sepehr</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rashidi</surname><given-names>Sina</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zolnour</surname><given-names>Ali</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dadkhah</surname><given-names>Maryam</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Haghbin</surname><given-names>Yasaman</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Azadmaleki</surname><given-names>Hossein</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Zolnoori</surname><given-names>Maryam</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Columbia University Irving Medical Center</institution><addr-line>622 W, 168th St</addr-line><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff2"><institution>School of Nursing, Columbia University</institution><addr-line>560 W, 168th 
St</addr-line><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><aff id="aff3"><institution>Data Science Institute, Columbia University</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Yin</surname><given-names>Zhijun</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hao</surname><given-names>Yilun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Zhihong</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Maryam Zolnoori, PhD, School of Nursing, Columbia University, 560 W, 168th St, New York, NY, 10032, United States, 1 212-305-5756; <email>mz2825@cumc.columbia.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>26</day><month>3</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e82608</elocation-id><history><date date-type="received"><day>18</day><month>08</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>16</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Fatemeh Taherinezhad, Mohamad Javad Momeni Nezhad, Sepehr Karimi, Sina Rashidi, Ali Zolnour, Maryam Dadkhah, Yasaman Haghbin, Hossein Azadmaleki, Maryam Zolnoori. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 26.3.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e82608"/><abstract><sec><title>Background</title><p>Over half of US adults with Alzheimer disease and related dementias (ADRD) remain undiagnosed. Speech-based screening algorithms offer a scalable approach, but the relative value of large language model (LLM) adaptation strategies is unclear.</p></sec><sec><title>Objective</title><p>The study aimed to compare LLM adaptation strategies for cognitive impairment detection across DementiaBank speech datasets using both text-only and multimodal models.</p></sec><sec sec-type="methods"><title>Methods</title><p>We analyzed audio-recorded speech from 237 participants in the ADReSSo subset of DementiaBank (ADRD vs cognitive normal [CN]) and report performance on a held-out test set (n=71). 
Nine text-only LLMs (3B-405B; open-weight and commercial) and 3 multimodal audio-text models were evaluated. Adaptations included (1) in-context learning (ICL) with 4 demonstration selection strategies (most similar, least similar, average similar or prototype, and random), (2) reasoning-augmented prompting (self- or teacher-generated rationales, self-consistency, tree-of-thought with domain experts), (3) parameter-efficient fine-tuning (token-level vs added classification head), and (4) multimodal audio-text integration. Generalizability of the adaptation strategies was evaluated on the DementiaBank Delaware dataset (n=205; mild cognitive impairment vs CN) using the first 3 strategies. The primary outcome was the <italic>F</italic><sub>1</sub>-score for the cognitive impaired class; the area under the receiver operating characteristic curve was reported when available.</p></sec><sec sec-type="results"><title>Results</title><p>On the ADReSSo dataset, average similar (prototype) demonstrations achieved the highest ICL performance across model sizes (<italic>F</italic><sub>1</sub>-score up to 0.81). Reasoning primarily benefited smaller models: teacher-generated rationales increased LLaMA 8B from <italic>F</italic><sub>1</sub>-score 0.72 to 0.76; expert-role tree-of-thought improved its zero-shot score from 0.65 to 0.71. Token-level fine-tuning produced the highest scores (LLaMA 3B: <italic>F</italic><sub>1</sub>=0.83, 95% CI 0.01, area under the curve [AUC]=0.91; LLaMA 70B: <italic>F</italic><sub>1</sub>=0.82, 95% CI 0.02, AUC=0.86; GPT-4o: <italic>F</italic><sub>1</sub>=0.79, 95% CI 0.01, AUC=0.87). A classification head markedly improved MedAlpaca 7B (<italic>F</italic><sub>1</sub>=0.06, 95% CI 0.02 to <italic>F</italic><sub>1</sub>=0.81, 95% CI 0.04), indicating model-dependent benefits of this approach. Among multimodal models, fine-tuned Phi-4 Multimodal reached an <italic>F</italic><sub>1</sub>-score of 0.80 (cognitive impaired) and 0.75 (CN) but did not exceed the top text-only systems. On the Delaware dataset, ICL achieved a high performance (LLaMA 8B: <italic>F</italic><sub>1</sub>=0.74; GPT-4o: <italic>F</italic><sub>1</sub>=0.80). Reasoning-augmented ICL improved LLaMA 8B to an <italic>F</italic><sub>1</sub>-score of 0.75. Token-level fine-tuning produced the highest scores (LLaMA 8B: <italic>F</italic><sub>1</sub>=0.76, 95% CI 0.02; GPT-4o: <italic>F</italic><sub>1</sub>=0.82, 95% CI 0.03).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Detection accuracy is influenced by demonstration selection, reasoning design, and tuning method. Token-level fine-tuning is generally most effective, while a classification head benefits models that perform poorly under token-based supervision. Properly adapted open-weight models can match or exceed commercial LLMs, supporting their use in scalable speech-based ADRD and mild cognitive impairment screening. 
Current multimodal models may require improved audio-text alignment and/or larger training corpora.</p></sec></abstract><kwd-group><kwd>cognitive impairment detection</kwd><kwd>speech-based screening</kwd><kwd>large language models adaptation</kwd><kwd>in-context learning</kwd><kwd>reasoning-augmented prompting</kwd><kwd>fine-tuning</kwd><kwd>multimodal speech-text analysis</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Alzheimer disease (AD) and related dementias (ADRD) pose a significant public health challenge, currently affecting approximately 5 million individuals, or 11% of older adults in the United States [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. This number is projected to rise to 13.2 million by 2050 [<xref ref-type="bibr" rid="ref4">4</xref>], underscoring the need for early, scalable detection strategies. Despite national efforts, over half of individuals with ADRD remain undiagnosed and untreated [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. To address this gap, the National Institute on Aging has prioritized the development of accurate, accessible screening tools [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>A promising direction involves natural language processing to analyze spontaneous speech, which may reveal subtle cognitive changes missed by conventional screening [<xref ref-type="bibr" rid="ref9">9</xref>]. Picture description tasks, such as the &#x201C;Cookie Theft&#x201D; scene [<xref ref-type="bibr" rid="ref10">10</xref>], are widely used to elicit language markers of early decline. Prior pipelines follow two main approaches: (1) engineering acoustic and linguistic features [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>] (eg, lexical diversity and syntactic complexity), and (2) fine-tuning transformer encoders such as Bidirectional Encoder Representations from Transformer (BERT) [<xref ref-type="bibr" rid="ref14">14</xref>] (for transcripts) and Wav2Vec 2.0 [<xref ref-type="bibr" rid="ref15">15</xref>] (for raw audio). While both strategies show promise, they require extensive feature engineering and large labeled corpora [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]&#x2014;resources often lacking in clinical settings&#x2014;limiting generalizability across dialects and institutions [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>Large language models (LLMs) offer new opportunities for cognitive impairment detection by modeling complex linguistic patterns, performing few-shot in-context learning (ICL) [<xref ref-type="bibr" rid="ref20">20</xref>], generating reasoning chains, and adapting via fine-tuning. LLMs show strong performance in clinical decision support tasks [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref25">25</xref>], including detection of depression [<xref ref-type="bibr" rid="ref26">26</xref>], anxiety [<xref ref-type="bibr" rid="ref27">27</xref>], suicide risk [<xref ref-type="bibr" rid="ref28">28</xref>], and medication-related errors [<xref ref-type="bibr" rid="ref29">29</xref>]. 
Applications to cognitive impairment are emerging but remain limited&#x2014;for example, using GPT-4 in zero-shot fluency scoring, GPT-3 embeddings for classification, or comparing GPT-3.5, GPT-4, and Bard [<xref ref-type="bibr" rid="ref30">30</xref>] on DementiaBank [<xref ref-type="bibr" rid="ref31">31</xref>] transcripts. These studies suggest feasibility but lack systematic comparisons of prompting methods, fine-tuning, and multimodal inputs.</p><p>We present the first comprehensive evaluation of state-of-the-art LLMs, including open-weight (LLaMA [<xref ref-type="bibr" rid="ref32">32</xref>], Ministral [<xref ref-type="bibr" rid="ref33">33</xref>], MedAlpaca [<xref ref-type="bibr" rid="ref34">34</xref>], DeepSeek [<xref ref-type="bibr" rid="ref35">35</xref>]) and commercial models (GPT-4o [<xref ref-type="bibr" rid="ref36">36</xref>], Gemini 2.0 Flash [<xref ref-type="bibr" rid="ref37">37</xref>]), for early detection of ADRD using the ADReSSo dataset from DementiaBank [<xref ref-type="bibr" rid="ref38">38</xref>]. Our study comprises four components (<xref ref-type="fig" rid="figure1">Figure 1</xref>): (1) ICL with demonstration selection to assess the impact of different sampling strategies; (2) reasoning-augmented prompting to evaluate whether structured reasoning enhances LLM performance, particularly in smaller models; (3) parameter-efficient fine-tuning to improve classification accuracy beyond prompt-based methods; and (4) evaluation of multimodal LLMs that integrate audio and text to determine the added value of acoustic information. To assess generalizability beyond a single dataset and task, we additionally evaluate 3 components for mild cognitive impairment (MCI) detection on the DementiaBank Delaware dataset [<xref ref-type="bibr" rid="ref39">39</xref>], which includes multiple speech tasks.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study workflow and evaluation framework for LLM-based ADRD detection. Participants complete the Cookie-Theft picture-description task, and their responses are audio-recorded under standardized clinician instructions. Recordings are segmented per speaker and transcribed with AWS. Text-only (transcription) pipeline (orange) includes: (1) In-context learning (ICL) with demonstration selection&#x2014;few-shot examples are drawn from cognitively normal (CN) and cognitively impaired (CI) speakers using four sampling rules (Most Similar, Least Similar, Average Similar, Random); (2) Reasoning-augmented prompting models receive self-generated/teacher rationales, self-consistency voting, or tree-of-thought chains; (3) Parameter-efficient fine-tuning using supervised text-based tuning and addition of a lightweight classification head. Audio-enabled pipeline (purple): Raw speech and its transcript are fed to multimodal / audio LLMs, which directly encode acoustic and linguistic cues before inference, yielding the same binary outcome labels.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e82608_fig01.png"/></fig></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dataset</title><p>This study analyzed audio recordings from the ADReSSo dataset, a subset of DementiaBank [<xref ref-type="bibr" rid="ref31">31</xref>] Pitt Corpus picture-description task (<xref ref-type="table" rid="table1">Table 1</xref>). The dataset contains 237 participants&#x2014;122 cognitive impaired and 115 cognitive normal (CN). 
Following ADReSSo&#x2019;s original split, 166 participants (n=87 cognitive impaired and n=79 CN) formed the development set, and 71 (n=35 cognitive impaired and n=36 CN) constituted the held-out test set. All diagnoses were made by neurologists or certified cognitive specialists.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of the participants in &#x201C;the ADReSSo dataset.&#x201D;</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Attribute</td><td align="left" valign="bottom" colspan="2">Train</td><td align="left" valign="bottom" colspan="2">Validation</td><td align="left" valign="bottom" colspan="2">Test</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Case</td><td align="left" valign="bottom">Control</td><td align="left" valign="bottom">Case</td><td align="left" valign="bottom">Control</td><td align="left" valign="bottom">Case</td><td align="left" valign="bottom">Control</td></tr></thead><tbody><tr><td align="left" valign="top">Participants (n)</td><td align="left" valign="top">60</td><td align="left" valign="top">56</td><td align="left" valign="top">27</td><td align="left" valign="top">23</td><td align="left" valign="top">35</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top">Gender (F/M)</td><td align="left" valign="top">39/21</td><td align="left" valign="top">37/19</td><td align="left" valign="top">19/8</td><td align="left" valign="top">15/8</td><td align="left" valign="top">21/14</td><td align="left" valign="top">23/13</td></tr><tr><td align="left" valign="top">Age (y), mean (SD)</td><td align="left" valign="top">69.33 (7.14)</td><td align="left" valign="top">66.27 (6.81)</td><td align="left" valign="top">70.59 (6.01)</td><td align="left" valign="top">65.48 (4.72)</td><td align="left" valign="top">68.51 (7.12)</td><td align="left" valign="top">66.11 (6.53)</td></tr><tr><td align="left" valign="top">Age range (y)</td><td align="left" valign="top">53&#x2010;79</td><td align="left" valign="top">54&#x2010;80</td><td align="left" valign="top">60&#x2010;80</td><td align="left" valign="top">56&#x2010;74</td><td align="left" valign="top">56&#x2010;79</td><td align="left" valign="top">56&#x2010;78</td></tr><tr><td align="left" valign="top">Age quartiles (y; 25%, 50%, 75%)</td><td align="left" valign="top">(65, 70, 75)</td><td align="left" valign="top">(60.75, 67, 71.25)</td><td align="left" valign="top">(65, 72, 76.5)</td><td align="left" valign="top">(63.5, 66, 68)</td><td align="left" valign="top">(63, 69, 74)</td><td align="left" valign="top">(61, 66, 70)</td></tr><tr><td align="left" valign="top">MMSE<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, mean (SD)</td><td align="left" valign="top">17.80 (5.04)</td><td align="left" valign="top">29.04 (1.13)</td><td align="left" valign="top">16.63 (5.94)</td><td align="left" valign="top">28.87 (1.22)</td><td align="left" valign="top">18.86 (5.8)</td><td align="left" valign="top">28.91 (1.25)</td></tr><tr><td align="left" valign="top">MMSE range</td><td align="left" valign="top">7&#x2010;28</td><td align="left" valign="top">26&#x2010;30</td><td align="left" valign="top">3&#x2010;27</td><td align="left" valign="top">26&#x2010;30</td><td align="left" valign="top">5&#x2010;27</td><td align="left" valign="top">24&#x2010;30</td></tr><tr><td align="left" valign="top">MMSE quartiles (25%, 50%, 75%)</td><td align="left" valign="top">(14.75, 18, 20)</td><td align="left" 
valign="top">(28, 29, 30)</td><td align="left" valign="top">(13.5, 17, 20.5)</td><td align="left" valign="top">(28.5, 29, 30)</td><td align="left" valign="top">(16, 20, 24)</td><td align="left" valign="top">(28, 29, 30)</td></tr><tr><td align="left" valign="top">Recording length, mean (SD)</td><td align="left" valign="top">87.20 (48.35)</td><td align="left" valign="top">68.98 (25.85)</td><td align="left" valign="top">88.52 (43.27)</td><td align="left" valign="top">68.25 (25.43)</td><td align="left" valign="top">79.42 (36.79)</td><td align="left" valign="top">66.35 (28.17)</td></tr><tr><td align="left" valign="top">Recording length range</td><td align="left" valign="top">35.26&#x2010;268.49</td><td align="left" valign="top">22.79&#x2010;168.61</td><td align="left" valign="top">39.91&#x2010;219.5</td><td align="left" valign="top">26.16&#x2010;121.47</td><td align="left" valign="top">28.39&#x2010;150.15</td><td align="left" valign="top">22.35&#x2010;135.68</td></tr><tr><td align="left" valign="top">Recording length quartiles (25%, 50%, 75%)</td><td align="left" valign="top">(54.28, 75.93, 99.94)</td><td align="left" valign="top">(52.15, 67.6, 77.8)</td><td align="left" valign="top">(60.01, 80.24, 97.45)</td><td align="left" valign="top">(44.54, 67.77, 82.11)</td><td align="left" valign="top">(51.52, 70.20, 106.97)</td><td align="left" valign="top">(44.4, 66.04, 77.69)</td></tr><tr><td align="left" valign="top">Word count, mean (SD)</td><td align="left" valign="top">82.63 (43.32)</td><td align="left" valign="top">114.43 (78.21)</td><td align="left" valign="top">101.67 (55.49)</td><td align="left" valign="top">111.39 (43.18)</td><td align="left" valign="top">92.49 (57.38)</td><td align="left" valign="top">111.72 (53.86)</td></tr><tr><td align="left" valign="top">Word count range</td><td align="left" valign="top">22&#x2010;189</td><td align="left" valign="top">21&#x2010;523</td><td align="left" valign="top">31&#x2010;284</td><td align="left" valign="top">54&#x2010;197</td><td align="left" valign="top">27&#x2010;256</td><td align="left" valign="top">45&#x2010;243</td></tr><tr><td align="left" valign="top">Word count quartiles (25%, 50%, 75%)</td><td align="left" valign="top">(51.25, 70.5, 106.25)</td><td align="left" valign="top">(67.25, 101, 139.75)</td><td align="left" valign="top">(67, 93, 118)</td><td align="left" valign="top">(78.5, 91, 147)</td><td align="left" valign="top">(50, 70, 120.5)</td><td align="left" valign="top">(63.5, 97, 168.25)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>MMSE: Mini-Mental State Examination.</p></fn></table-wrap-foot></table-wrap><p>A validation set was drawn from the development data via stratified sampling on diagnosis, Mini-Mental State Examination (MMSE) score, gender, and audio duration, yielding 116 training and 50 validation subjects. Recordings were transcribed with Amazon Web Services (AWS) General Transcribe [<xref ref-type="bibr" rid="ref40">40</xref>].</p><p>According to the ADReSSo organizers, each audio file is a description of the &#x201C;Cookie Theft&#x201D; picture from the Boston Diagnostic Aphasia Exam, recorded at a 16 kHz sampling rate. 
The preprocessing steps included the following: (1) after speaker diarization (patient-clinician), the clinician&#x2019;s speech was removed; and (2) noise reduction was performed using spectral subtraction and amplitude normalization.</p><p>We denote the transcription of a subject <inline-formula><mml:math id="ieqn1"><mml:mi>i</mml:mi></mml:math></inline-formula> by <inline-formula><mml:math id="ieqn2"><mml:msubsup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, in which <inline-formula><mml:math id="ieqn3"><mml:mi>S</mml:mi></mml:math></inline-formula> represents the subject and the superscript <inline-formula><mml:math id="ieqn4"><mml:mi>C</mml:mi></mml:math></inline-formula> indicates the subject&#x2019;s cognitive status, with <inline-formula><mml:math id="ieqn5"><mml:mi>C</mml:mi><mml:mi mathvariant="normal"> </mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo>{</mml:mo><mml:mi>C</mml:mi><mml:mi>I</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="normal"> </mml:mi><mml:mi>C</mml:mi><mml:mi>N</mml:mi><mml:mo>}</mml:mo></mml:math></inline-formula>.</p><p>Participants were aged 53 years or older; women comprised more than 60% of each group. MMSE scores ranged from 3 to 28 in cognitive impaired (mild-severe impairment) and greater than 24 in CN. CN speakers produced more words on average, whereas cognitive impaired speakers had longer recordings, suggesting slower speech or greater effort (<xref ref-type="table" rid="table1">Table 1</xref>).</p><p>To examine distributional similarity across partitions, we applied <italic>t</italic>-distributed stochastic neighbor embedding to word-level embeddings from the transcripts (<xref ref-type="fig" rid="figure2">Figure 2A</xref>) and to vectorized demographics (age, MMSE, gender, recording length, and word count; <xref ref-type="fig" rid="figure2">Figure 2B</xref>), providing insight into overlap among training, validation, and test sets.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p><italic>t</italic>-SNE visualization of linguistic and demographic feature spaces across dataset splits. (A) 2D <italic>t</italic>-SNE projection of word-level transcript embeddings. Points are color-coded by dataset split (train, validation, and test) and diagnosis (control vs ADRD). The extensive overlap indicates that all partitions occupy a comparable linguistic feature space, minimizing risk of distribution shift. (B) <italic>t</italic>-SNE projection of participant-level metadata vectors combining age, Mini-Mental State Examination, gender, recording duration, and word count. Three natural clusters reflect shared acoustic-demographic profiles, yet samples from every split and label are intermixed within each cluster, confirming balanced coverage of nonlinguistic characteristics across partitions. ADRD: Alzheimer disease and related dementias; <italic>t</italic>-SNE: <italic>t</italic>-distributed stochastic neighbor embedding.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e82608_fig02.png"/></fig></sec><sec id="s2-2"><title>Text-Only LLMs Used in This Study</title><p>We evaluated 9 LLMs spanning diverse model sizes and training objectives. <italic>GPT-4o</italic> (text-only) served as a benchmark, representing a proprietary high-capacity model with advanced language understanding. 
<italic>LLaMA 3.2 3B Instruct</italic> [<xref ref-type="bibr" rid="ref32">32</xref>] (LLaMA 3B), the smallest model, tested whether lightweight architectures can capture linguistic cues of cognitive impairment. <italic>LLaMA 3.1 8B Instruct</italic> (LLaMA 8B), a mid-sized model, was selected for its balance between efficiency and capacity to detect class-specific patterns. <italic>MedAlpaca 7B</italic> [<xref ref-type="bibr" rid="ref34">34</xref>], fine-tuned on biomedical text, examined whether domain-specific pretraining enhances sensitivity to clinical language. <italic>Ministral 8B</italic> [<xref ref-type="bibr" rid="ref33">33</xref>], optimized for efficient inference and strong text representation, evaluated the performance of general-purpose mid-sized models. <italic>LLaMA 3.3 70B Instruct</italic> (LLaMA 70B) and <italic>LLaMA 3.1 405B Instruct</italic> (LLaMA 405B), large and ultralarge open-weight models, tested the impact of scale on capturing linguistic signals. <italic>Gemini 2.0 Flash</italic> [<xref ref-type="bibr" rid="ref37">37</xref>], a commercial model optimized for low-latency inference and embedded reasoning, was included for its potential to detect cognitive impairment-related cues. <italic>DeepSeek-R1</italic> [<xref ref-type="bibr" rid="ref35">35</xref>], trained on diverse multilingual data, assessed whether alternative training paradigms generalize across speaker populations.</p></sec><sec id="s2-3"><title>LLM Adaptation Strategies for Cognitive Impairment Detection</title><sec id="s2-3-1"><title>Component 1: ICL with Demonstration Selection</title><p>ICL prompts were composed of four elements: an Instruction (<inline-formula><mml:math id="ieqn6"><mml:mi>I</mml:mi><mml:mi>N</mml:mi></mml:math></inline-formula>), a set of demonstrations (<inline-formula><mml:math id="ieqn7"><mml:mi>D</mml:mi><mml:mi>M</mml:mi></mml:math></inline-formula>), the test input (<inline-formula><mml:math id="ieqn8"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>), and the corresponding output label (<inline-formula><mml:math id="ieqn9"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>). 
The model estimates the conditional probability:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mi>I</mml:mi><mml:mi>N</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mtext>&#x00A0;</mml:mtext></mml:mrow><mml:mi>D</mml:mi><mml:mi>M</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where each demonstration <inline-formula><mml:math id="ieqn10"><mml:mi>D</mml:mi><mml:msub><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfenced separators="|"><mml:mrow><mml:mi>T</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>n</mml:mi><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi>p</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi mathvariant="normal"> </mml:mi><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:msub><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn11"><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:msub><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mo>{</mml:mo><mml:mi>C</mml:mi><mml:mi>N</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi mathvariant="normal"> </mml:mi><mml:mi>C</mml:mi><mml:mi>I</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>}</mml:mo></mml:math></inline-formula>.</p><p>We began with a zero-shot baseline (ie, <inline-formula><mml:math id="ieqn12"><mml:mi>D</mml:mi><mml:mi>M</mml:mi><mml:mi> </mml:mi><mml:mo>=</mml:mo><mml:mi> </mml:mi><mml:mi>&#x2205;</mml:mi></mml:math></inline-formula>), followed by few-shot experiments with <inline-formula><mml:math id="ieqn13"><mml:mi>N</mml:mi><mml:mi> </mml:mi><mml:mo>=</mml:mo><mml:mi> </mml:mi><mml:mo>{</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>4</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>6</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>8</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>10</mml:mn><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mn>12</mml:mn><mml:mo>}</mml:mo></mml:math></inline-formula> demonstrations. All prompts were standardized in structure and length across models to control for prompt-induced variance. The prompt is presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To examine how the type of demonstrations influences performance, we evaluated four selection strategies: (1) most similar, (2) least similar, (3) average similarity to class prototypes, and (4) random. 
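</p><p>As an illustrative sketch (not the exact implementation used in this study), the four selection rules, formalized below, can be expressed as follows, assuming each transcript has already been encoded into an L2-normalized semantic embedding; all function and variable names are illustrative.</p><preformat>
import numpy as np

def select_demonstrations(test_emb, train_embs, train_labels, n_shots, strategy, rng=None):
    """Pick a balanced set of demonstrations (n_shots // 2 per class).

    test_emb     -- embedding of the test transcript, shape (d,)
    train_embs   -- embeddings of candidate training transcripts, shape (n, d),
                    L2-normalized so that a dot product equals cosine similarity
    train_labels -- array of length n with values "CI" or "CN"
    strategy     -- "most_similar", "least_similar", "average_similar", or "random"
    """
    rng = rng or np.random.default_rng(0)
    chosen = []
    for cls in ("CI", "CN"):
        idx = np.where(train_labels == cls)[0]
        k = n_shots // 2
        if strategy == "random":
            picked = rng.choice(idx, size=k, replace=False)
        else:
            # Reference embedding: the test transcript itself, or the class
            # centroid (prototype) for the "average_similar" strategy.
            ref = train_embs[idx].mean(axis=0) if strategy == "average_similar" else test_emb
            scores = train_embs[idx] @ ref
            order = np.argsort(scores)  # ascending similarity
            picked = idx[order[:k]] if strategy == "least_similar" else idx[order[-k:]]
        chosen.extend(picked.tolist())
    return chosen  # indices of the selected demonstrations
</preformat><p>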
Each strategy selected <inline-formula><mml:math id="ieqn14"><mml:mi>N</mml:mi><mml:mo>/</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> demonstrations from each class (cognitive impaired and CN) to maintain balance.</p><p>Let <inline-formula><mml:math id="ieqn15"><mml:msubsup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> denote the <italic>i</italic>th transcript from class <inline-formula><mml:math id="ieqn16"><mml:mi>C</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo>{</mml:mo><mml:mi>C</mml:mi><mml:mi>I</mml:mi><mml:mo>,</mml:mo><mml:mi> </mml:mi><mml:mi>C</mml:mi><mml:mi>N</mml:mi><mml:mo>}</mml:mo></mml:math></inline-formula>, and let <inline-formula><mml:math id="ieqn17"><mml:mi>E</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfenced><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> represent its semantic embedding computed using the Beijing Academy of Artificial Intelligence General Embedding transformer model [<xref ref-type="bibr" rid="ref41">41</xref>]. For each test input <inline-formula><mml:math id="ieqn18"><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, we computed its embedding <inline-formula><mml:math id="ieqn19"><mml:mi>E</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></inline-formula>, and calculated cosine similarity with all candidate demonstrations (from the separated training dataset):</p><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mi>S</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi mathvariant="normal"> </mml:mi><mml:mfenced separators="|"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="normal">cos</mml:mi></mml:mrow><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi mathvariant="normal"> </mml:mi><mml:mi>E</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfenced></mml:mrow></mml:mfenced></mml:mrow></mml:mrow></mml:math></disp-formula><p>where the reference embedding <inline-formula><mml:math id="ieqn20"><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> was defined differently for each strategy:</p><list list-type="bullet"><list-item><p>Most similar: <inline-formula><mml:math 
id="ieqn21"><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>E</mml:mi><mml:mo>(</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:math></inline-formula>. Select the top <inline-formula><mml:math id="ieqn22"><mml:mi>N</mml:mi><mml:mo>/</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> samples per class with highest cosine similarity to the test input.</p></list-item></list><list list-type="bullet"><list-item><p>Least similar: <inline-formula><mml:math id="ieqn23"><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>E</mml:mi><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced></mml:math></inline-formula>. Select the bottom <inline-formula><mml:math id="ieqn24"><mml:mi>N</mml:mi><mml:mo>/</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> samples per class with the lowest similarity to the test input.</p></list-item><list-item><p>Average similar: <inline-formula><mml:math id="ieqn25"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mover><mml:mi>E</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mi>C</mml:mi></mml:msup></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>, where <inline-formula><mml:math id="ieqn26"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:msup><mml:mover><mml:mi>E</mml:mi><mml:mo accent="false">&#x00AF;</mml:mo></mml:mover><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:msup><mml:mi>N</mml:mi><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mi>E</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:msubsup><mml:mi>S</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>. 
Select the <inline-formula><mml:math id="ieqn27"><mml:mi>N</mml:mi><mml:mo>/</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> samples per class most similar to their class centroid, average of embeddings in each class.</p></list-item><list-item><p>Random: Ignore similarity score and sample <inline-formula><mml:math id="ieqn28"><mml:mi>N</mml:mi><mml:mo>/</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> transcriptions per class uniformly at random.</p></list-item></list><p>Each strategy reflects a different hypothesis about which demonstrations best support generalization and reasoning:</p><list list-type="bullet"><list-item><p>Most similar: examples provide contextual alignment, enhancing sensitivity to subtle cues.</p></list-item><list-item><p>Least similar: examples increase linguistic variability, aiding generalization.</p></list-item><list-item><p>Average similarity: samples serve as class prototypes, anchoring class distinctions.</p></list-item><list-item><p>Random: serves as a baseline for assessing the general value of demonstrations.</p></list-item></list><p>We computed <italic>F</italic><sub>1</sub>-scores for the cognitive impaired class on the validation set across all shot counts (n=2-12). The optimal n for each strategy was selected on the validation set and used for final evaluation on the held-out test set.</p></sec><sec id="s2-3-2"><title>Component 2: Impact of Reasoning-Based Methods on Small LLMs</title><sec id="s2-3-2-1"><title>Overview</title><p>To assess whether explicit reasoning enhances classification accuracy in cognitive impairment detection, we evaluated three reasoning-based prompting strategies across three resource-efficient LLMs: LLaMA 3B, LLaMA 8B, and Ministral 8B Instruct (<xref ref-type="fig" rid="figure3">Figure 3</xref>). To support these smaller models, we incorporated rationales generated either by the models themselves (self-generated) or by larger teacher models (GPT-4o and LLaMA 405B).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Overview of reasoning pipeline. (A) The methods include self-generated reasoning, teacher-generated reasoning, and self-consistency, where reasoning-augmented demonstrations are used for cognitive impaired or CN classification, and self-consistency aggregates multiple runs via majority voting. (B) Tree-of-thought pipeline, where the model is prompted to act as 3 experts, either unspecified or domain-specific, generate tree-of-reasoning (TR<italic>)</italic> and determine the final label by consensus. CI: cognitive impaired; CN: cognitive normal; ToT: tree-of-thought.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e82608_fig03.png"/></fig></sec><sec id="s2-3-2-2"><title>Reasoning-Augmented In-Context Learning (Reasoning-ICL)</title><p>Reasoning-ICL augments each demonstration with an explanatory rationale alongside the input transcription and label, enabling the model to better associate linguistic features with cognitive status. Rationales were sourced from (1) self-generated explanations by the target model, and (2) teacher-generated rationales from a larger LLM (eg, GPT-4o or LLaMA 405B). For each combination, we computed the <italic>F</italic><sub>1</sub>-score for the cognitive impaired class on the validation set and selected the best-performing shot count. The final performance was then evaluated on the held-out test set using this optimal configuration. 
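</p><p>The following minimal sketch illustrates how such reasoning-augmented demonstrations, formalized as triplets below, can be assembled into a single prompt; the field names and formatting are illustrative rather than the exact prompt design used in this study.</p><preformat>
def build_reasoning_prompt(instruction, demonstrations, test_transcript):
    """Assemble a reasoning-augmented ICL prompt.

    demonstrations -- list of dicts with keys "transcript", "rationale", "label",
                      where "rationale" is a self- or teacher-generated explanation
                      and "label" is "CI" or "CN"
    """
    blocks = [instruction]
    for dm in demonstrations:
        blocks.append(
            "Transcript: {t}\nReasoning: {r}\nLabel: {l}".format(
                t=dm["transcript"], r=dm["rationale"], l=dm["label"]
            )
        )
    # The target model is asked to produce its own rationale before the final label.
    blocks.append("Transcript: {t}\nReasoning:".format(t=test_transcript))
    return "\n\n".join(blocks)
</preformat><p>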
See <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for prompt design.</p><p>Formally, for each training transcription <inline-formula><mml:math id="ieqn29"><mml:msubsup><mml:mrow><mml:mi>S</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>C</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, a rationale <inline-formula><mml:math id="ieqn30"><mml:msubsup><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> was generated, where <inline-formula><mml:math id="ieqn31"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>x</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mi>f</mml:mi><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>h</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> indicates the source of the explanation. Each demonstration is a triplet:</p><p><inline-formula><mml:math id="ieqn32"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>D</mml:mi><mml:msubsup><mml:mi>M</mml:mi><mml:mi>k</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msubsup><mml:mi>S</mml:mi><mml:mi>k</mml:mi><mml:mi>C</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:msubsup><mml:mi>n</mml:mi><mml:mi>k</mml:mi><mml:mi>x</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:msub><mml:mi>l</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> </p><p>where <inline-formula><mml:math id="ieqn33"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>L</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:mi>k</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mi>C</mml:mi><mml:mi>I</mml:mi><mml:mo>,</mml:mo><mml:mi>C</mml:mi><mml:mi>N</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula></p><p>At inference, the target LLM received the test input <inline-formula><mml:math id="ieqn34"><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, a reasoning-specific instruction <inline-formula><mml:math id="ieqn35"><mml:msub><mml:mrow><mml:mi>I</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, and a set of augmented demonstrations <inline-formula><mml:math id="ieqn36"><mml:msubsup><mml:mrow><mml:mi>D</mml:mi><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> (chosen 
by the &#x201C;Average&#x201D; demonstration selection strategy introduced in component 1), then jointly generated both a rationale and classification label:</p><disp-formula id="equWL4"><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>&#x2223;</mml:mo><mml:msub><mml:mi>T</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>I</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:msubsup><mml:mi>M</mml:mi><mml:mi>K</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>This framework enabled us to test whether adding structured rationales, generated by either the model itself or a more capable teacher model, improves the model&#x2019;s ability to detect cognitive impairment.</p></sec><sec id="s2-3-2-3"><title>Self-Consistency With Teacher-Generated Reasoning</title><p>To assess whether reasoning-augmented ICL could be further improved by reducing variability in model outputs, we implemented the self-consistency [<xref ref-type="bibr" rid="ref42">42</xref>] method, which aggregates predictions across multiple independently sampled inference runs using a fixed prompt. We restricted self-consistency to teacher-generated rationales, as results from <italic>Results&#x2014;Component 2: Self-consistency with teacher-generated reasoning</italic> (see Results) showed that teacher-based reasoning using rationales from LLaMA 405B consistently outperformed self-generated rationales, GPT-4o rationales, and nonreasoning prompts across most shot counts.</p><p>For each shot count 2 to 12, we used demonstrations augmented with teacher-generated rationales <inline-formula><mml:math id="ieqn37"><mml:msubsup><mml:mrow><mml:mi>D</mml:mi><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>. Each test input <inline-formula><mml:math id="ieqn38"><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> was processed 5 times using the same instruction (<inline-formula><mml:math id="ieqn39"><mml:msub><mml:mrow><mml:mi>I</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> ) and the same set of demonstrations, under two temperature settings: 0.0 for deterministic decoding and 0.5 to introduce controlled randomness. 
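</p><p>A minimal sketch of this repeated-inference and majority-vote procedure, formalized in the equations that follow, is shown below; the generate_once callable stands in for a single reasoning-augmented ICL call to the target model at the chosen temperature and is purely illustrative.</p><preformat>
from collections import Counter

def self_consistent_label(generate_once, test_transcript, n_runs=5):
    """Aggregate labels from repeated runs of the same fixed prompt.

    generate_once -- callable that performs one reasoning-augmented ICL inference
                     (sampling temperature is configured inside the callable) and
                     returns a label string, "CI" or "CN"
    """
    votes = [generate_once(test_transcript) for _ in range(n_runs)]
    label, _ = Counter(votes).most_common(1)[0]  # majority vote
    return label, votes
</preformat><p>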
Each run produced a pair: a generated rationale and a corresponding label (<inline-formula><mml:math id="ieqn40"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>, <inline-formula><mml:math id="ieqn41"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>).</p><disp-formula id="equWL5"><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msup><mml:mi>P</mml:mi><mml:mi>i</mml:mi></mml:msup><mml:mo stretchy="false">(</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>&#x2223;</mml:mo><mml:msub><mml:mi>T</mml:mi><mml:mi>S</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>I</mml:mi><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:msubsup><mml:mi>M</mml:mi><mml:mi>K</mml:mi><mml:mrow><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msubsup><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mn>5</mml:mn></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>The final predicted label <inline-formula><mml:math id="ieqn42"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mover><mml:msub><mml:mi>L</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>&#x2322;</mml:mo></mml:mover></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> was computed by majority vote over the 5 predicted labels <inline-formula><mml:math id="ieqn43"><mml:msubsup><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mn>1</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula>, &#x2026;, <inline-formula><mml:math id="ieqn44"><mml:msubsup><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mn>5</mml:mn><mml:mo>)</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula> using the below formula.</p><disp-formula id="equWL6"><mml:math id="eqn5"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mrow><mml:munder><mml:mrow><mml:mi mathvariant="normal">argmax</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo>(</mml:mo><mml:mi mathvariant="italic">ADRD</mml:mi><mml:mo>,</mml:mo><mml:mi mathvariant="normal"> </mml:mi><mml:mi mathvariant="italic">Healthy</mml:mi><mml:mo>)</mml:mo></mml:mrow></mml:munder></mml:mrow><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mrow><mml:msubsup><mml:mo 
stretchy="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>5</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mn>1</mml:mn><mml:mo>{</mml:mo><mml:msubsup><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mfenced separators="|"><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:mfenced></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>l</mml:mi><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula></sec><sec id="s2-3-2-4"><title>Tree-of-Thought Reasoning</title><p>To evaluate a structured, multistep reasoning approach beyond self-consistency, we implemented the tree-of-thought (ToT) [<xref ref-type="bibr" rid="ref43">43</xref>] prompting framework. This method guides the model to break down decisions into intermediate steps, allowing it to generate and evaluate multiple reasoning paths before producing a final classification. By reasoning step-by-step, the model can retain, revise, or discard partial thoughts, potentially improving coherence and robustness.</p><p>We adopted a zero-shot setup to assess ToT&#x2019;s effectiveness independently of in-context demonstrations. For each test input, the model was prompted to reason from the perspective of 3 simulated experts, each generating a short sequence of reasoning steps. We tested the following 2 prompt formats:</p><list list-type="order"><list-item><p><italic>Unspecified experts:</italic> experts introduced with &#x201C;Imagine three different experts are analyzing a speech transcript.&#x201D;</p></list-item><list-item><p><italic>Domain-specific experts: </italic>experts identified as a language and cognition specialist, a neurocognitive researcher, and a speech-language pathologist.</p></list-item></list><p>Each expert generated up to 2 sequential reasoning steps before providing a final classification. This corresponds to a tree with depth 2 and breadth 3. We capped the depth at 2 steps, as additional steps often led to repetitive or uninformative outputs.</p><p>This setup enabled evaluation of ToT as a standalone reasoning strategy without demonstrations, while maintaining consistent prompt structure and model size across methods. Full prompt templates are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec></sec></sec><sec id="s2-4"><title>Component 3: Fine-Tuning for Binary Classification</title><p>To assess whether task-specific adaptation improves model performance, we fine-tuned a subset of LLMs to classify transcripts as either cognitively impaired or CN. We implemented 2 approaches to fine-tuning.</p><sec id="s2-4-1"><title>Token-Level Supervised Fine-Tuning</title><p>In this approach, classification was framed as a next-token prediction task. Each transcript was paired with a task-specific prompt (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>), and the model was trained to generate the target label token, &#x201C;AD&#x201D; (cognitive impaired) or &#x201C;Healthy&#x201D; (CN). The objective was token-level cross-entropy loss over the model&#x2019;s vocabulary, with the correct label token as the target.</p><p>Fine-tuning was applied to open-weight models (LLaMA 3B, LLaMA 8B, LLaMA 70B, MedAlpaca 7B, and Ministral) using Low-Rank Adaptation (LoRA) [<xref ref-type="bibr" rid="ref44">44</xref>] for parameter-efficient optimization. 
We used LoRA to constrain fine-tuning to a low-rank update of a small subset of parameters, reducing effective model capacity and thereby lowering the risk of memorization and overfitting in this limited-data setting. We performed a grid search over LoRA rank (32, 64, and 128), dropout (0.00, 0.05, and 0.10), learning rate (2e-4), and batch size (4, 8, and 16), with training epochs from 1 to 13. The best configuration for each model was selected based on the <italic>F</italic><sub>1</sub>-score for the cognitive impaired class on the validation set (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). To quantify training variability due to random initialization and data order, we repeated fine-tuning across 5 distinct random seeds (controlling both data shuffling and adapter initialization) and report the mean <italic>F</italic><sub>1</sub>-score with 95% CIs.</p><p>For commercial models (GPT-4o and Gemini-2.0), we used application programming interface (API)-level fine-tuning options, including learning rate multipliers, training epochs, and adapter or batch size where applicable. Hyperparameter choices were guided by API documentation and prior work. As with open-weight models, final settings were selected based on the <italic>F</italic><sub>1</sub>-score for the cognitive impaired class on the validation set.</p><p>At inference, temperature was fixed at 0.0 for deterministic decoding. Where available, we also extracted class probabilities to compute threshold-independent metrics, specifically the area under the receiver operating characteristic curve. These probabilities were derived from SoftMax-normalized logits assigned to the &#x201C;AD&#x201D; and &#x201C;Healthy&#x201D; tokens in the output layer (see <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref> for more details).</p></sec><sec id="s2-4-2"><title>Classification Head Fine-Tuning</title><p>In this approach, we reframed the classification task by appending a lightweight classification head to the final hidden state of the LLM. The head consisted of 3 fully connected layers (output size: vocabulary dimension &#x2192; 512 &#x2192; 256 &#x2192; 2), following the standard architecture used in Hugging Face implementations [<xref ref-type="bibr" rid="ref45">45</xref>]. It was trained using binary cross-entropy loss to directly map hidden representations to class probabilities.</p><p>Unlike the token-level method, this approach decouples classification from language generation, allowing the model to learn class-specific features from its internal states rather than relying on token prediction. This method was applied only to open-weight models, where hidden representations are accessible.</p><p>Training inputs, prompts, and hyperparameter tuning followed the same procedures as in the first approach (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendices 4</xref> and <xref ref-type="supplementary-material" rid="app5">5</xref>). During inference on the held-out test set, the classification head generated logits for each class, which were then converted into labels for evaluation.</p></sec></sec><sec id="s2-5"><title>Component 4: Evaluating Multimodal LLMs as Classifier</title><p>To evaluate multimodal LLMs for cognitive impairment classification, we tested 3 state-of-the-art models using paired audio and transcripts. All models were prompted to process both modalities and output the patient&#x2019;s cognitive status. 
<xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref> includes details of this prompt.</p><list list-type="bullet"><list-item><p><italic>GPT-4o mini</italic> [<xref ref-type="bibr" rid="ref36">36</xref>]: OpenAI&#x2019;s closed-weight model supporting text and audio inputs. Due to limited access, we performed zero-shot inference using the API with temperature set to 0.</p></list-item><list-item><p><italic>Qwen 2.5 Omni </italic>[<xref ref-type="bibr" rid="ref46">46</xref>]: Evaluated using two strategies:</p><list list-type="bullet"><list-item><p>Zero-shot<italic>:</italic> Run with Hugging Face&#x2019;s recommended parameters (eg, temperature=1.0, top-k=50, top-p=1.0).</p></list-item><list-item><p>Fine-tuning: Performed using LLaMA Factory on training-set audio-transcript pairs. LoRA was used for efficient adaptation with recommended hyperparameters [<xref ref-type="bibr" rid="ref46">46</xref>] (see <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref> for details).</p></list-item></list></list-item><list-item><p><italic>Phi-4 Multimodal </italic>[<xref ref-type="bibr" rid="ref47">47</xref>]: Microsoft&#x2019;s multimodal successor to the Phi series.</p><list list-type="bullet"><list-item><p>Zero-shot<italic>:</italic> Run with Hugging Face&#x2019;s recommended parameters (eg, temperature=1.0, top-k=50, top-p=1.0) using the same instruction prompt as Qwen and GPT-4o.</p></list-item><list-item><p>Fine-tuning<italic>:</italic> Conducted via Hugging Face by a grid search over gradient accumulation steps, number of epochs, and audio length and with recommended LoRA-based settings [<xref ref-type="bibr" rid="ref47">47</xref>] (see <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref> for details and ablation studies).</p></list-item></list></list-item></list><p>For both Qwen and Phi-4, the number of epochs was selected based on the validation of the <italic>F</italic><sub>1</sub>-score for the cognitive impaired class.</p></sec><sec id="s2-6"><title>Error Analysis</title><sec id="s2-6-1"><title>Overview</title><p>We selected fine-tuned GPT-4o and LLaMA 8B for error analysis because they showed consistent high performance across adaptation strategies and represent two practical deployment settings. LLaMA 8B represents a strong open-weight, parameter-efficient model that can be deployed locally in low-resource or privacy-sensitive clinical settings, whereas GPT-4o represents a high-capacity commercial model that operates through an external service. By including one open-weight model and one commercial model, we aimed to provide error insights that are relevant to different deployment settings.</p></sec><sec id="s2-6-2"><title>Qualitative Analysis</title><p>Two team members with expertise in audio analysis independently reviewed all misclassified cases in the held-out test set, including false positives (FP; CN predicted as impaired) and false negatives (FN; cognitive impaired predicted as normal). For each case, reviewers listened to the raw audio and examined the corresponding AWS transcribe output. Each misclassification was annotated for the presence of (1) noise in the audio and (2) missing or partial transcription. 
We adopted this manual review approach because these error metrics cannot be reliably assessed using automated metrics alone (eg, signal-to-noise ratio-based measures) and due to the lack of gold-standard transcripts.</p></sec><sec id="s2-6-3"><title>Quantitative Analysis</title><p>We computed 25 text-derived metrics across four domains&#x2014;lexical richness (11), syntactic complexity (7), disfluency or repetition (2), and semantic coherence (5) (<xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>)&#x2014;for all test samples. These domains were included because initial analysis showed that the automatic measures were reliable and captured several error patterns observed during human review. Distributions were compared using a 2-sided Mann-Whitney <italic>U</italic> test [<xref ref-type="bibr" rid="ref48">48</xref>] for true positive (TP) versus FN within the cognitive impaired group and true negative (TN) versus FP within the CN group. A <italic>P</italic> value less than .10 was used to flag potential differences.</p></sec></sec><sec id="s2-7"><title>Generalization Beyond ADReSSo: External Validation on Delaware Dataset</title><p>We evaluated the performance of three components on the DementiaBank Delaware dataset [<xref ref-type="bibr" rid="ref49">49</xref>], which includes 3 picture-description tasks (Cookie Theft, Cat Rescue, and Rockwell), a Cinderella story recall, and a procedural discourse task. The dataset consists of 205 English-speaking participants (n=99 with MCI, and n=106 CN).</p><p>We performed a 60%-20%&#x2010;20% participant-level split for training (n=124), validation (n=40), and testing (n=41), ensuring that recordings from each participant appeared in only one split. To calculate the <italic>F</italic><sub>1</sub>-score, we aggregated the predictions for each participant across all tasks, applying majority voting to the predicted labels and comparing them to the ground-truth labels.</p><p>On this dataset, we evaluated adaptation strategies described in component 1&#x2010;3:</p><list list-type="bullet"><list-item><p>Component 1: ICL with demonstration selection&#x2014;We focused on LLaMA 8B and GPT-4o because they showed high and stable performance across demonstration selection strategies. Specifically, we presented the results of the <italic>Most Similar</italic> demonstration selection strategy because it generally outperformed other strategies.</p></list-item><list-item><p>Component 2: Impact of reasoning-based methods on small LLMs&#x2014;Specifically, we used Reasoning-Augmented ICL for LLaMA 8B using rationales generated by LLaMA 405B, which showed the highest performance in our ADReSSo-related experiments.</p></list-item><list-item><p>Component 3: Fine-tuning for binary classification&#x2014;Similar to ICL, we evaluated LLaMA 8B and GPT-4o, which outperformed other open-weight and commercial LLMs in token-level supervised fine-tuning. Both LLMs were trained on the Delaware training set with hyperparameters selected on the validation set, following the hyperparameter selection procedure described for ADReSSo. We repeated fine-tuning across five distinct random seeds and reported the mean performance with 95% CIs.</p></list-item></list></sec><sec id="s2-8"><title>Ethical Considerations</title><p>The data used in this study were obtained from the Pitt Corpus and Delaware Corpus in the DementiaBank database, a publicly available resource hosted by TalkBank. 
Pitt&#x2019;s original data collection was approved by the Institutional Review Board of the University of Pittsburgh. Delaware&#x2019;s collection was supported by the National Institute of Aging of the National Institutes of Health under award number RF1AG083823. As this study involved secondary analysis of deidentified data, no additional Institutional Review Board approval was required. Informed consent was obtained from participants in the original studies that contributed data to the DementiaBank database.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Throughout this section, <italic>F</italic><sub>1</sub>-scores refer to the cognitive impaired class unless otherwise specified.</p><sec id="s3-1"><title>Component 1: ICL with Demonstration Selection</title><p><xref ref-type="fig" rid="figure4">Figure 4A</xref> presents validation <italic>F</italic><sub>1</sub>-scores for each LLM using 2 to 12 in-context demonstrations across 4 selection strategies. Demonstrations selected by <italic>average similarity</italic> to class centroids achieved the highest or joint-highest <italic>F</italic><sub>1</sub>-scores in 5 models and ranked second in 3 others. The <italic>most similar</italic> strategy generally produced the next-best performance, with notable results for GPT-4o and Gemini-2.0. <italic>Least similar</italic> examples yielded the lowest scores overall, except for MedAlpaca-7B and LLaMA 3B. <italic>Random</italic> selection showed minimal improvement over zero-shot, suggesting limited benefit from unstructured examples. In larger models, performance gains plateaued after 6 demonstrations, indicating reduced sensitivity to demonstration quality, whereas smaller models remained more influenced by selection strategy.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Impact of demonstration selection strategies on in-context learning performance across LLMs. (A) Results for validation: <italic>F</italic><sub>1</sub>-scores for 2-12 demonstrations show average similarity often outperforming other methods, with larger models plateauing after approximately 6 shots and smaller models showing greater sensitivity to selection quality. (B) Results for test: Using optimal shot counts from (A), average similarity achieves the highest scores for most models, while most similar leads in a few cases. Numbers above bars indicate <italic>F</italic><sub>1</sub>-scores &#x00D7;100<italic>.</italic> LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e82608_fig04.png"/></fig><p><xref ref-type="fig" rid="figure4">Figure 4B</xref> shows corresponding results on the test set. <italic>Average similarity</italic> achieved the highest <italic>F</italic><sub>1</sub>-scores in 5 models, including LLaMA 3B (0.73), Ministral 8B (0.73), LLaMA 70B (0.79), GPT-4o (0.81), and DeepSeek-R1 (0.79). <italic>Most similar</italic> was optimal for LLaMA 8B (0.72), LLaMA 405B (0.80), and Gemini-2.0 (0.81). <italic>Least similar</italic> continued to underperform, while MedAlpaca-7B again performed best with random samples (<italic>F</italic><sub>1</sub>=0.67). 
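</p><p>As a rough illustration of the average similarity strategy (a sketch under assumed tooling, not the study&#x2019;s exact implementation), the snippet below embeds the training transcripts, computes one centroid per class, and selects the transcripts closest to each centroid as in-context demonstrations; the sentence-embedding model and function names are assumptions.</p><preformat>
# Hypothetical sketch of centroid-based ("average similarity") demonstration
# selection. The embedding model and variable names are assumptions.
import numpy as np
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")  # assumed encoder

def select_demonstrations(transcripts, labels, shots_per_class=3):
    emb = encoder.encode(transcripts, normalize_embeddings=True)
    selected = []
    for cls in sorted(set(labels)):
        idx = [i for i, y in enumerate(labels) if y == cls]
        centroid = emb[idx].mean(axis=0)
        centroid = centroid / np.linalg.norm(centroid)
        sims = emb[idx] @ centroid  # cosine similarity (embeddings are unit length)
        best = np.argsort(-sims)[:shots_per_class]
        selected.extend((transcripts[idx[i]], cls) for i in best)
    return selected  # class-central examples used as ICL demonstrations
</preformat><p>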
These results highlighted the importance of selecting representative, class-central demonstrations to enhance generalization in ICL.</p></sec><sec id="s3-2"><title>Component 2: Impact of Reasoning-Based Methods on Small LLMs</title><sec id="s3-2-1"><title>Reasoning-Augmented In-Context Learning (Reasoning-ICL)</title><p>Validation results (<xref ref-type="fig" rid="figure5">Figure 5A</xref>) indicated that adding rationales improved <italic>F</italic><sub>1</sub>-scores across all 3 small LLMs compared with the no-reasoning baseline. Rationales generated by LLaMA 405B yielded the largest gains. With 10 demonstrations, LLaMA 3B achieved an <italic>F</italic><sub>1</sub>-score of 0.78 (vs 0.64 baseline), while with 12 shots, Ministral 8B reached 0.77 (vs 0.61), and LLaMA 8B reached 0.76 (vs 0.72). Rationales from GPT-4o consistently outperformed self-generated rationales but were generally below LLaMA 405B across most shot counts.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Reasoning-augmented in-context learning and self-consistency performance on small LLMs. (A) Reasoning-augmented in-context learning (validation): Adding rationales, especially those generated by LLaMA 405B, improved <italic>F</italic><sub>1</sub>-scores over the no-reasoning baseline, with the largest gains in LLaMA 3B and Ministral 8B. (B) Self-consistency (validation): Majority voting over multiple outputs with LLaMA 405B&#x2013;generated rationales showed minor changes, with temperature adjustments having limited benefit. (C) Reasoning-augmented in-context learning (test): Performance trends differed from validation, with the best scores varying by model and rationale source. (D) Self-consistency (test): Majority voting slightly improved stability and accuracy for LLaMA 3B but not for larger models. Numbers above bars indicate <italic>F</italic><sub>1</sub>-scores &#x00D7; 100<italic>.</italic> LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e82608_fig05.png"/></fig><p>Test-set results (<xref ref-type="fig" rid="figure5">Figure 5C</xref>) were less aligned with validation trends. GPT-4o rationales produced the highest <italic>F</italic><sub>1</sub>-score for Ministral 8B (0.72), while LLaMA 405B rationales yielded the best result for LLaMA 8B (0.78). Notably, LLaMA 3B performed best (0.66) with self-generated rationales. These discrepancies indicate that validation-set trends may not reliably reflect a model&#x2019;s generalization behavior, and that performance improvements from specific rationale sources or shot counts should be interpreted with caution.</p></sec><sec id="s3-2-2"><title>Self-Consistency With Teacher-Generated Reasoning</title><p>Using LLaMA 405B-generated rationales, we sampled multiple outputs per input and aggregated predictions by majority vote (<xref ref-type="fig" rid="figure5">Figure 5B</xref>). Even with temperature=0, repeated inferences produced slight variations, reflecting the inherent stochasticity of LLMs. Aggregating predictions via majority voting left LLaMA 8B performance unchanged, but reduced <italic>F</italic><sub>1</sub>-scores from 0.78 to 0.75 for LLaMA 3B and from 0.77 to 0.74 for Ministral 8B.
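</p><p>As a minimal sketch of this aggregation step (an illustration, with generate_label as an assumed helper that prompts the model once and returns &#x201C;AD&#x201D; or &#x201C;Healthy&#x201D;), the snippet below samples several generations per transcript and takes a majority vote over the predicted labels.</p><preformat>
# Hypothetical sketch of self-consistency: sample several generations per
# transcript and aggregate the predicted labels by majority vote.
# generate_label is an assumed helper (one prompted inference per call).
from collections import Counter

def self_consistent_label(transcript, generate_label, n_samples=5, temperature=0.0):
    votes = [generate_label(transcript, temperature=temperature) for _ in range(n_samples)]
    return Counter(votes).most_common(1)[0][0]  # most frequent label wins
</preformat><p>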
Using a moderate temperature (0.5) increased output variation without improving performance.</p><p>On the test set (<xref ref-type="fig" rid="figure5">Figure 5D</xref>), although self-consistency improved performance for some models, it did not preserve the validation trend. Relative to the results obtained with LLaMA 405B rationales, majority voting at a temperature of 0.0 increased the <italic>F</italic><sub>1</sub>-score by 0.005 for LLaMA 3B (to 0.72) and by 0.05 for LLaMA 8B (to 0.76), but lowered it by 0.01 for Ministral 8B (to 0.67). Results at a temperature of 0.5 were lower for the LLaMA family and only improved Ministral 8B&#x2019;s performance to 0.71. These findings highlight self-consistency as an effective strategy for mitigating prediction instability, an intrinsic property of LLMs, even under deterministic decoding settings.</p></sec><sec id="s3-2-3"><title>ToT Reasoning</title><p>In zero-shot classification, the 3 evaluated models&#x2014;LLaMA 3B, LLaMA 8B, and Ministral 8B&#x2014;achieved baseline <italic>F</italic><sub>1</sub>-scores of 0.73, 0.55, and 0.57, respectively. Applying ToT prompting with unspecified expert roles altered performance to 0.59 (&#x2212;0.10), 0.63 (+0.11), and 0.66 (+0.09) for LLaMA 3B, LLaMA 8B, and Ministral 8B, respectively. When domain-relevant expert roles were incorporated, <italic>F</italic><sub>1</sub>-scores increased to 0.68 (+0.09 vs nonexpert), 0.71 (+0.05), and 0.69 (+0.03), respectively. Compared with zero-shot, expert-role ToT produced notable gains for LLaMA 8B (+0.16) and Ministral 8B (+0.12), but remained below baseline for LLaMA 3B (&#x2212;0.05).</p><p>These findings indicate that expert-grounded prompting can enhance large model performance in cognitive impairment classification, whereas the smaller model, despite benefiting most from expert-role ToT relative to its nonexpert counterpart, may lack the capacity for sustained multistep reasoning.</p></sec></sec><sec id="s3-3"><title>Component 3: Fine-Tuning for Binary Classification</title><sec id="s3-3-1"><title>Token-Level Supervised Fine-Tuning</title><p><xref ref-type="fig" rid="figure6">Figure 6</xref> compares token-level supervised fine-tuning and classification-head fine-tuning across 6 models, reporting the area under the curve (AUC) for the best-performing configuration of each model. Under token-level supervision (<xref ref-type="fig" rid="figure6">Figure 6A</xref>), LLaMA 3B and LLaMA 8B achieved the highest AUCs (0.91 and 0.90) and corresponding <italic>F</italic><sub>1</sub>-scores of 0.83, 95% CI 0.01 and 0.81, 95% CI 0.01, respectively. These were followed by GPT-4o (AUC=0.87, <italic>F</italic><sub>1</sub>=0.79, 95% CI 0.01), LLaMA 70B (AUC=0.86, <italic>F</italic><sub>1</sub>=0.82, 95% CI 0.02), Ministral 8B (AUC=0.83, <italic>F</italic><sub>1</sub>=0.77, 95% CI 0.01), and MedAlpaca 7B (AUC=0.66, <italic>F</italic><sub>1</sub>=0.06, 95% CI 0.02). Performance patterns indicate that smaller and mid-sized models achieved strong class separability, whereas MedAlpaca 7B underperformed, likely due to tokenization-related mismatches with the task data.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Comparison of token-level and classification-head fine-tuning for binary classification on the test set. (A) Token-level fine-tuning shows strong AUC performance for most models, with LLaMA 3B and LLaMA 8B leading and MedAlpaca 7B lagging.
(B) Classification-head fine-tuning markedly boosts MedAlpaca 7B but reduces performance for models already strong under token-level training. AUC: area under the curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e82608_fig06.png"/></fig><p>To assess potential overfitting due to validation-based hyperparameter selection, we additionally report validation and test <italic>F</italic><sub>1</sub>-scores for all token-level fine-tuned models in Table 3 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. Across models, validation <italic>F</italic><sub>1</sub>-scores are high and closely aligned with the corresponding test <italic>F</italic><sub>1</sub>-scores, indicating that hyperparameter selection using the validation set did not lead to substantial overfitting, as performance generalized consistently to the held-out test set.</p></sec><sec id="s3-3-2"><title>Classification Head Fine-Tuning</title><p>In contrast, classification-head fine-tuning (<xref ref-type="fig" rid="figure6">Figure 6B</xref>) substantially improved MedAlpaca 7B (AUC=0.92, <italic>F</italic><sub>1</sub>=0.81, 95% CI 0.04, +0.75 improvement), while LLaMA 3B and LLaMA 8B declined to AUC values of 0.89 and 0.88 (<italic>F</italic><sub>1</sub>=0.75, 95% CI 0.03 and 0.80, 95% CI 0.01, respectively). Although Ministral 8B and LLaMA 70B increased the AUC to 0.86 and 0.87, their <italic>F</italic><sub>1</sub>-scores dropped to 0.74, 95% CI 0.02 and 0.79, 95% CI 0.02, respectively. These results suggest that classification-head fine-tuning can markedly benefit models that perform poorly with token-level supervision, while models already performing well under token-level training may not gain, and can even lose, performance when switching to a classification-head approach.</p></sec></sec><sec id="s3-4"><title>Component 4: Evaluating Multimodal LLMs as Classifier</title><p>The following findings have been observed:</p><list list-type="bullet"><list-item><p><italic>GPT-4o Mini:</italic> In the zero-shot setting, GPT-4o Mini achieved a high <italic>F</italic><sub>1</sub>-score for cognitive impaired cases (0.70) but only 0.29 for CN cases, indicating substantial bias toward predicting impairment. Fine-tuning was not performed due to OpenAI&#x2019;s access limitations.</p></list-item><list-item><p><italic>Qwen 2.5-Omni:</italic> Zero-shot performance yielded an <italic>F</italic><sub>1</sub>-score of 0.70 for CN cases and 0.54 for cognitive impaired cases, reflecting a reverse bias toward predicting CN. Fine-tuning did not improve performance and failed to address this imbalance.</p></list-item><list-item><p><italic>Phi-4 Multimodal:</italic> Zero-shot performance was balanced, with <italic>F</italic><sub>1</sub>-scores of 0.53 for cognitive impaired and 0.51 for CN cases. Fine-tuning led to substantial gains, reaching 0.80 for cognitive impaired and 0.75 for CN cases, the highest overall performance and largest improvement among all models.</p></list-item></list><p>These findings indicate that while GPT-4o Mini and Qwen 2.5-Omni performed reasonably in zero-shot mode, both exhibited strong class biases and limited benefit from fine-tuning. 
In contrast, Phi-4 Multimodal maintained balanced zero-shot performance and responded strongly to fine-tuning, underscoring the importance of task-specific training for robust CN classification.</p></sec><sec id="s3-5"><title>Error Analysis</title><sec id="s3-5-1"><title>Misclassification Overview</title><p>On the held-out test set (n=71), LLaMA 8B produced 6 FPs (TN=30) and 7 FNs (TP=28); GPT-4o produced 8 FPs (TN=28) and 7 FNs (TP=28). Three FPs and 4 FNs overlapped across models. <xref ref-type="fig" rid="figure7">Figure 7</xref> summarizes error metrics for LLaMA 8B and GPT-4o.</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Distribution of linguistic and technical issues contributing to model misclassifications on the test set. (A) LLaMA 8B false positives were primarily due to disfluencies or repetition and semantic coherence. (B) LLaMA 8B false negatives were mainly linked to lexical richness and syntactic complexity. (C) GPT-4o false positives were dominated by disfluencies/repetition, with smaller contributions from semantic coherence and transcription-related issues. (D) GPT-4o false negatives were largely associated with syntactic complexity and lexical richness.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e82608_fig07.png"/></fig></sec><sec id="s3-5-2"><title>Qualitative Analysis</title><p>Two problematic cases were excluded: 1 involved noisy audio with overlapping speech that produced an unrelated transcript and was misclassified by both models as an FP; the other had a missing transcription despite a high-quality audio file and was misclassified by GPT-4o as an FP. The remaining samples were used for quantitative analysis.</p></sec><sec id="s3-5-3"><title>Quantitative Analysis</title><p>Mann-Whitney <italic>U</italic> tests (<italic>P</italic> value &#x003C;.10) showed significant feature differences between correct and incorrect predictions. For GPT-4o, disfluencies or repetition differed for TP versus FN and TN versus FP, and semantic coherence differed for TN versus FP. For LLaMA 8B, syntactic complexity and semantic coherence differed for TP versus FN; lexical richness, semantic coherence, and syntactic complexity differed for TN versus FP. Together, these results suggest that misclassifications arise when a sample&#x2019;s linguistic profile resembles that of the opposite class.</p><p>A limitation of this analysis is that all automatic speech transcription systems are prone to word insertions, repetitions, and truncations, which may have contributed to some of the observed errors and are reflected in our error categorization.</p></sec></sec><sec id="s3-6"><title>External Generalizability Evaluation: DementiaBank Delaware Dataset</title><p>We evaluated components 1-3 as follows:</p><list list-type="bullet"><list-item><p>Component 1: ICL with demonstration selection&#x2014;LLaMA 8B achieved an <italic>F</italic><sub>1</sub>-score of 0.74 with 4 demonstrations for the Most Similar demonstration strategy, while GPT-4o performed better with only 2 demonstrations, reaching an <italic>F</italic><sub>1</sub>-score of 0.80.
Adding further demonstrations led to a decline in GPT-4o&#x2019;s performance, suggesting that GPT-4o may be more sensitive to demonstration noise, where additional examples dilute task-relevant signals rather than providing incremental benefit.</p></list-item><list-item><p>Component 2: Impact of reasoning-based methods on small LLMs&#x2014;We next evaluated LLaMA 8B with reasoning-augmented ICL, using rationales generated by LLaMA 405B. This approach resulted in an <italic>F</italic><sub>1</sub>-score of 0.75 with 10 shots on the Delaware dataset, suggesting that reasoning-based ICL can further enhance performance for smaller models by improving the quality of the input prompts.</p></list-item><list-item><p>Component 3: Fine-tuning for binary classification&#x2014;Finally, we fine-tuned both LLaMA 8B and GPT-4o on the Delaware dataset using prompts similar to those used with the ADReSSo dataset. Token-level fine-tuning led to an improvement for both models: LLaMA 8B reached an <italic>F</italic><sub>1</sub>-score of 0.76, 95% CI 0.02, and GPT-4o reached an <italic>F</italic><sub>1</sub>-score of 0.82, 95% CI 0.03. These results confirm the benefits of fine-tuning LLMs on task-specific data.</p></list-item></list></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study presents the first comprehensive evaluation of multiple adaptation strategies (ICL, reasoning-augmented ICL, self-consistency, ToT, and supervised fine-tuning) across state-of-the-art open-weight and commercial LLMs for detecting early cognitive impairment from speech transcripts (ADReSSo subset of the Pitt Corpus). Fine-tuning yielded the strongest performance: LLaMA 3B, LLaMA 70B, and LLaMA 8B achieved <italic>F</italic><sub>1</sub>-scores of 0.83, 95% CI 0.01; 0.82, 95% CI 0.02; and 0.81, 95% CI 0.01, respectively, outperforming GPT-4o (<italic>F</italic><sub>1</sub>=0.79, 95% CI 0.01). These results show that small open-weight models, when adapted to domain-specific tasks, can match or exceed commercial models, offering practical advantages in scalability and deployment.</p><p>In the context of ICL, the demonstration selection strategy proved critical to performance. Demonstrations selected based on average similarity to class centroids, intended to reflect prototypical speech patterns of CN and impaired individuals, outperformed those based on most similar, least similar, or random selection. This effect was observed across both small and large models, with performance gains plateauing after 6 examples in larger models. These results highlight the importance of representative, class-central exemplars for guiding model generalization, especially in clinical tasks where linguistic variability may obscure diagnostic signals.</p><p>Teacher-generated rationales from LLaMA 405B or GPT-4o improved reasoning-augmented ICL for smaller models, increasing the <italic>F</italic><sub>1</sub>-score of LLaMA 8B from 0.72 to 0.76. This suggests that teacher-generated reasoning can guide models toward better predictions, reducing adaptation costs by substituting for manually labeled examples. Self-consistency&#x2014;aggregating predictions from repeated runs&#x2014;boosted LLaMA 3B from 0.66 to 0.72 but offered limited benefit for larger models.
These findings suggest that self-consistency mitigates prediction variability in smaller LLMs but is less impactful in models with more stable outputs.</p><p>We also observed discrepancies in the performance of some LLMs across the validation and test sets. For example, the average similarity demonstration selection strategy yielded the highest <italic>F</italic><sub>1</sub>-score on the validation set for LLaMA 8B, whereas the most similar demonstrations achieved the highest <italic>F</italic><sub>1</sub>-score on the test set. Similarly, in the reasoning-augmented ICL component, LLMs achieved the highest <italic>F</italic><sub>1</sub>-scores with rationales generated by different teacher models on the validation and test sets. These discrepancies were more pronounced in smaller LLMs, which tend to be less generalizable and more sensitive to input variations. As mentioned earlier, the validation set was drawn from the stratified ADReSSo development data, whereas the test set followed the official ADReSSo split and appears to contain more challenging cases with somewhat different linguistic profiles. Such differences in data distribution may account for the observed performance discrepancies in smaller models. Therefore, we recommend interpreting validation scores for smaller, non&#x2013;fine-tuned LLMs with caution and adopting multiple adaptation strategies rather than relying on a single &#x201C;best&#x201D; strategy.</p><p>Token-level fine-tuning outperformed classification-head adaptation for most models. An exception was MedAlpaca-7B, which performed poorly in the token-based setup (<italic>F</italic><sub>1</sub>=0.06, 95% CI 0.02 for the cognitive impaired class), likely due to its difficulty generating the correct label token during inference. However, when trained with a classification head, its performance improved substantially (<italic>F</italic><sub>1</sub>=0.81, 95% CI 0.04 for the cognitive impaired class). Overall, these results suggest that the optimal fine-tuning formulation depends on how reliably a model can produce discrete label tokens. It is worth mentioning that although fine-tuning LLMs outperforms other adaptation strategies, it might reduce LLMs&#x2019; generalizability for data that lie outside the training data distribution. For example, LLMs fine-tuned on the Cookie Theft picture description task may not result in the best performance on other speech tasks such as story recall.</p><p>Hyperparameter choices for fine-tuning LLMs are dataset- and model-specific and should be re-evaluated when applying fine-tuning to new benchmarks or data from different clinical settings. In this study, we limited hyperparameter selection to a constrained, literature-informed grid (eg, LoRA or quantized low-rank adaptation rank, dropout, learning rate, batch size, and epochs) and selected configurations using a stratified validation subset drawn from the ADReSSo development set. We further evaluated robustness by repeating fine-tuning across 5 random seeds, reporting mean <italic>F</italic><sub>1</sub>-score with 95% CIs, and examining validation-test consistency and ablations (Tables 3-5, <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). The selected hyperparameter values are intended to serve as practical starting points for future work rather than universally optimal settings.</p><p>Overall, multimodal LLMs underperformed relative to text-only LLMs in our experiments.
In zero-shot settings, GPT-4o Mini and Qwen 2.5-Omni exhibited pronounced class bias, favoring the cognitive impaired class for GPT-4o Mini and the CN class for Qwen 2.5-Omni. Even after fine-tuning, Phi-4 Multimodal (<italic>F</italic><sub>1</sub>=0.80 for cognitive impaired; <italic>F</italic><sub>1</sub>=0.75 for CN) did not match the performance of the best text-only models. These findings suggest that the large audio branches of current multimodal LLMs are difficult to adapt with small clinical datasets and may introduce variability that propagates errors into the joint audio-text representation rather than providing consistently complementary information. This limitation is likely driven by a combination of insufficient task-specific speech supervision during training and the substantial data requirements for fine-tuning, rather than by a lack of informative acoustic cues in speech itself.</p><p>Consistent with this interpretation, prior work using smaller, speech-focused models such as Wav2Vec and mHuBERT has demonstrated that audio-based markers of cognitive impairment can be learned effectively on datasets of comparable scale. These models benefit from substantially fewer parameters and pretraining objectives explicitly tailored to speech, enabling more efficient adaptation to clinical speech tasks. Together, these results indicate that the observed underperformance of multimodal LLMs reflects current architectural and data-efficiency limitations, rather than a fundamental limitation of audio as a modality for cognitive-impairment detection.</p><p>External evaluation of the adaptation strategies on the DementiaBank Delaware dataset, with a distinct population (MCI vs control) and different speech tasks, supports their generalizability. Using LLaMA 8B and GPT-4o, the adaptation strategies showed high performance in distinguishing MCI from control cases. ICL with the most-similar demonstration strategy provided a strong baseline, adding teacher-generated reasoning improved LLaMA 8B&#x2019;s performance compared to ICL, and token-level fine-tuning resulted in the best overall performance. These findings also highlight the adaptation strategies&#x2019; effectiveness for early detection of MCI.</p><p>Our results suggest a simple decision framework for selecting adaptation strategies in future work. If fine-tuning is not feasible (eg, limited labeled data, limited computational resources), few-shot ICL can still perform strongly, especially when demonstrations are selected by average similarity to class centroids, the most consistent strategy across models. For smaller open-weight models in prompt-only configurations, structured reasoning approaches (eg, teacher-generated rationales or expert-grounded ToT prompting) improved performance under specific conditions, while self-consistency techniques reduced output variability. When labeled training data are available and on-premise computational constraints must be respected, parameter-efficient fine-tuning methods (eg, LoRA or quantized low-rank adaptation) provided the most reliable performance gains, achieving the highest overall accuracy among adaptation strategies. In settings where generative models showed instability in producing relevant tokens, reformulating the fine-tuning with a supervised classification head resulted in more stable and reproducible predictions.
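</p><p>As a minimal sketch of this reformulation (an illustration under assumed tooling, not the study&#x2019;s exact implementation), the snippet below attaches a small classification head, sized as reported (vocabulary dimension to 512 to 256 to 2), to the final-token output of a causal language model; the checkpoint name and the omitted training loop are assumptions.</p><preformat>
# Hypothetical sketch of classification-head fine-tuning: a small MLP maps the
# final-token output (vocabulary dimension to 512 to 256 to 2) to class logits,
# decoupling classification from next-token generation. The checkpoint name and
# training details are assumptions.
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

base_name = "meta-llama/Llama-3.1-8B"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_name)
backbone = AutoModelForCausalLM.from_pretrained(base_name)

head = nn.Sequential(
    nn.Linear(backbone.config.vocab_size, 512), nn.ReLU(),
    nn.Linear(512, 256), nn.ReLU(),
    nn.Linear(256, 2),
)

def class_logits(transcript):
    inputs = tokenizer(transcript, return_tensors="pt")
    final_token = backbone(**inputs).logits[:, -1, :]  # (1, vocab_size)
    return head(final_token)  # logits for the cognitive impaired and CN classes
</preformat><p>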
Finally, although task-specific fine-tuning improved multimodal audio-text models compared with zero-shot and in-context settings, these LLMs did not outperform the top-performing text-only models that were fine-tuned under the same data and evaluation protocol. This indicates that, for this study&#x2019;s task and dataset, incorporating acoustic features did not yield additional predictive gains beyond those captured in the transcripts.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>The Pitt Corpus is a widely used benchmark for cognitive impairment detection from speech. Prior studies have used hand-crafted acoustic features (eg, Mel-Frequency Cepstral Coefficients), transformer-based embeddings (eg, Wav2Vec 2.0), rule-based linguistic metrics (eg, Linguistic Inquiry and Word Count), and BERT-based embeddings, achieving <italic>F</italic><sub>1</sub>-scores between 0.70 and 0.87. Notable approaches include fine-tuning BERT-large with Automatic Speech Recognition scores [<xref ref-type="bibr" rid="ref50">50</xref>] (<italic>F</italic><sub>1</sub>=0.85), combining multiple BERT variants with a support vector machine [<xref ref-type="bibr" rid="ref38">38</xref>] (<italic>F</italic><sub>1</sub>=0.85), and ensembling logistic regression with fine-tuned BERT or Enhanced Representation through Knowledge Integration [<xref ref-type="bibr" rid="ref51">51</xref>] (<italic>F</italic><sub>1</sub>=0.82). More recent work has leveraged coattention fusion [<xref ref-type="bibr" rid="ref52">52</xref>] (<italic>F</italic><sub>1</sub>=0.86), multimodal fusion with ChatGPT-derived embeddings [<xref ref-type="bibr" rid="ref53">53</xref>] (<italic>F</italic><sub>1</sub>=0.87), and cross-modal attention [<xref ref-type="bibr" rid="ref54">54</xref>] (<italic>F</italic><sub>1</sub>=0.84). In comparison, our LLM-based methods achieved <italic>F</italic><sub>1</sub>-scores of up to 0.81 with ICL and 0.80&#x2010;0.83 with fine-tuning, demonstrating competitive performance with state-of-the-art systems.</p><p>These findings have important clinical implications. While biomarker-based tools (eg, blood pTau217 and &#x03B2;-amyloid assays [<xref ref-type="bibr" rid="ref55">55</xref>]) offer diagnostic value, they do not capture functional changes in everyday communication. Language impairment, an early sign of cognitive decline, remains poorly integrated into current screening workflows. LLM-based speech analysis offers a scalable, noninvasive approach to detect linguistic changes that complement biological markers. Integrating these tools into clinical settings [<xref ref-type="bibr" rid="ref56">56</xref>] could enable earlier detection, improve decision-making, and broaden access to timely care.</p><p>Future work should combine LLM-based speech analysis with biological data, evaluate performance across diverse populations to ensure fairness, and address implementation challenges such as clinician acceptance, workflow integration, and regulatory compliance. LLMs, particularly when adapted with representative examples and reasoning strategies, offer a promising foundation for scalable cognitive screening.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations. First, the use of 2 English datasets (the ADReSSo subset of DementiaBank Pitt Corpus and Delaware) limits generalizability to other speech corpora, languages, or dialects, potentially overlooking broader linguistic variability.
Second, the limited training data, especially for multimodal models, may have restricted learning from acoustic inputs, making their underperformance difficult to interpret as model limitations alone. Third, reliance on automatic speech recognition (AWS Transcribe) introduces transcription errors, particularly in impaired speech, which may disproportionately affect smaller models sensitive to input noise.</p></sec><sec id="s4-4"><title>Conclusion</title><p>This study provides the first systematic comparison of LLM-based adaptation strategies for detecting cognitive impairment from speech. Fine-tuned open-weight models matched or outperformed commercial LLMs and achieved performance comparable to advanced multimodal systems previously built on the benchmark Pitt Corpus. While current multimodal LLMs underperformed, results support LLM-based speech analysis as a scalable and effective approach for early cognitive screening.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This study was supported by the National Institute on Aging through grant K99/R00AG076808 (Development of a Screening Algorithm for Timely Identification of Patients with Mild Cognitive Impairment and Early Dementia in Home Health Care). The research methodology presented in this manuscript is based on this grant, which was reviewed and funded by the National Institute on Aging (score: 18). Additional support was provided by the Columbia Center for Interdisciplinary Research on Alzheimer&#x2019;s Disease Disparities (NIH P30 AG059303), a resource center for minority aging research that offers mentoring, pilot funding, and interdisciplinary research support to investigators addressing disparities in Alzheimer disease and related dementias.</p></sec><sec><title>Data Availability</title><p>The data are available from two sources: (1) the ADReSSo 2021 benchmark dataset, derived from the Pitt Corpus in DementiaBank, comprising 237 participants labeled as cognitive impaired or cognitively healthy, and (2) the Delaware corpus, also derived from DementiaBank, comprising 205 English-speaking participants labeled as mild cognitive impairment or cognitively healthy.
The codes for &#x201C;Speech-Based Cognitive Screening&#x201D; are publicly available at GitHub.</p></sec></notes><fn-group><fn fn-type="con"><p>Methodology design: FT, MJMN, YH, HA</p><p>Data analysis: FT, MJMN, SK, SR, AZ</p><p>Drafting the manuscript: FT, MJMN, MZ</p><p>Figure design: MD</p><p>Conceptual model design: MZ</p><p>Critically revising the manuscript: MZ</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ADRD</term><def><p>Alzheimer disease and related dementia</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb4">AWS</term><def><p>Amazon Web Service</p></def></def-item><def-item><term id="abb5">BERT</term><def><p>Bidirectional Encoder Representations from Transformer</p></def></def-item><def-item><term id="abb6">CN</term><def><p>cognitive normal</p></def></def-item><def-item><term id="abb7">FN</term><def><p>false negative</p></def></def-item><def-item><term id="abb8">FP</term><def><p>false positive</p></def></def-item><def-item><term id="abb9">ICL</term><def><p>in-context learning</p></def></def-item><def-item><term id="abb10">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb11">LoRA</term><def><p>Low-Rank Adaptation</p></def></def-item><def-item><term id="abb12">MCI</term><def><p>mild cognitive impairment</p></def></def-item><def-item><term id="abb13">MMSE</term><def><p>Mini-Mental State Examination</p></def></def-item><def-item><term id="abb14">TN</term><def><p>true negative</p></def></def-item><def-item><term id="abb15">TP</term><def><p>true positive</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thies</surname><given-names>W</given-names> </name><name name-style="western"><surname>Bleiler</surname><given-names>L</given-names> </name></person-group><article-title>2013 Alzheimer&#x2019;s disease facts and figures</article-title><source>Alzheimers Dement</source><year>2013</year><month>03</month><volume>9</volume><issue>2</issue><fpage>208</fpage><lpage>245</lpage><pub-id pub-id-type="doi">10.1016/j.jalz.2013.02.003</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zolnour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Topaz</surname><given-names>M</given-names> </name></person-group><article-title>ADscreen: a speech processing-based screening system for automatic identification of patients with Alzheimer&#x2019;s disease and related dementia</article-title><source>Artif Intell Med</source><year>2023</year><month>09</month><volume>143</volume><fpage>102624</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2023.102624</pub-id><pub-id pub-id-type="medline">37673583</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Barr&#x00F3;n</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><etal/></person-group><article-title>HomeADScreen: developing Alzheimer&#x2019;s disease and related dementia risk identification model in home healthcare</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105146</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105146</pub-id><pub-id pub-id-type="medline">37454558</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nichols</surname><given-names>LO</given-names> </name><name name-style="western"><surname>Martindale-Adams</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Kaplan</surname><given-names>EK</given-names> </name><name name-style="western"><surname>Zuber</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Waters</surname><given-names>TM</given-names> </name></person-group><article-title>Impact of the REACH II and REACH VA dementia caregiver interventions on healthcare costs</article-title><source>J Am Geriatr Soc</source><year>2017</year><month>05</month><volume>65</volume><issue>5</issue><fpage>931</fpage><lpage>936</lpage><pub-id pub-id-type="doi">10.1111/jgs.14716</pub-id><pub-id pub-id-type="medline">28295134</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boise</surname><given-names>L</given-names> </name><name name-style="western"><surname>Neal</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Kaye</surname><given-names>J</given-names> </name></person-group><article-title>Dementia assessment in primary care: results from a study in three managed care systems</article-title><source>J Gerontol A Biol Sci Med Sci</source><year>2004</year><month>06</month><volume>59</volume><issue>6</issue><fpage>M621</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1093/gerona/59.6.m621</pub-id><pub-id pub-id-type="medline">15215282</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Toth</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hoffmann</surname><given-names>I</given-names> </name><name name-style="western"><surname>Gosztolya</surname><given-names>G</given-names> </name><etal/></person-group><article-title>A speech recognition-based solution for the automatic detection of mild cognitive impairment from spontaneous speech</article-title><source>Curr Alzheimer Res</source><year>2018</year><month>01</month><day>3</day><volume>15</volume><issue>2</issue><fpage>130</fpage><lpage>138</lpage><pub-id pub-id-type="doi">10.2174/1567205014666171121114930</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><article-title>Assessing cognitive impairment in older patients</article-title><source>National Institute on Aging</source><access-date>2021-03-01</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.nia.nih.gov/health/assessing-cognitive-impairment-older-patients">https://www.nia.nih.gov/health/assessing-cognitive-impairment-older-patients</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name name-style="western"><surname>Topaz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Landau</surname><given-names>AY</given-names> </name><etal/></person-group><article-title>Natural language processing to identify home health care patients at risk for becoming incapacitated with no evident advance directives or surrogates</article-title><source>J Am Med Dir Assoc</source><year>2024</year><month>08</month><volume>25</volume><issue>8</issue><fpage>105019</fpage><pub-id pub-id-type="doi">10.1016/j.jamda.2024.105019</pub-id><pub-id pub-id-type="medline">38754475</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zolnour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vergez</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Beyond electronic health record data: leveraging natural language processing and machine learning to uncover cognitive insights from patient-nurse verbal communications</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>02</month><day>1</day><volume>32</volume><issue>2</issue><fpage>328</fpage><lpage>340</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae300</pub-id><pub-id pub-id-type="medline">39667364</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cummings</surname><given-names>L</given-names> </name></person-group><article-title>Describing the Cookie Theft picture: sources of breakdown in Alzheimer&#x2019;s dementia</article-title><source>Pragmat. 
Soc</source><year>2019</year><month>07</month><day>5</day><volume>10</volume><issue>2</issue><fpage>153</fpage><lpage>176</lpage><pub-id pub-id-type="doi">10.1075/ps.17011.cum</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meil&#x00E1;n</surname><given-names>JJG</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez-S&#x00E1;nchez</surname><given-names>F</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez-Nicol&#x00E1;s</surname><given-names>I</given-names> </name><name name-style="western"><surname>Llorente</surname><given-names>TE</given-names> </name><name name-style="western"><surname>Carro</surname><given-names>J</given-names> </name></person-group><article-title>Changes in the rhythm of speech difference between people with nondegenerative mild cognitive impairment and with preclinical dementia</article-title><source>Behav Neurol</source><year>2020</year><volume>2020</volume><fpage>4683573</fpage><pub-id pub-id-type="doi">10.1155/2020/4683573</pub-id><pub-id pub-id-type="medline">32351632</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aramaki</surname><given-names>E</given-names> </name><name name-style="western"><surname>Shikata</surname><given-names>S</given-names> </name><name name-style="western"><surname>Miyabe</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kinoshita</surname><given-names>A</given-names> </name></person-group><article-title>Vocabulary size in speech may be an early indicator of cognitive impairment</article-title><source>PLoS ONE</source><year>2016</year><volume>11</volume><issue>5</issue><fpage>e0155195</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0155195</pub-id><pub-id pub-id-type="medline">27176919</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zolnour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rashidi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Detecting mild cognitive impairment using follow-up call speech and electronic health record data in home health care settings</article-title><source>J Gerontol Nurs</source><year>2026</year><month>01</month><volume>52</volume><issue>1</issue><fpage>8</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.3928/00989134-20251208-03</pub-id><pub-id pub-id-type="medline">41439666</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><access-date>2026-02-27</access-date><conf-name>Proceedings of the 2019 Conference of the North American 
Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</conf-name><conf-date>Jun 2-7, 2019</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/N19-1423/">https://aclanthology.org/N19-1423/</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name></person-group><article-title>Efficient estimation of word representations in vector space</article-title><access-date>2026-02-27</access-date><conf-name>1st International Conference on Learning Representations, ICLR 2013</conf-name><conf-date>May 2-4, 2013</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dblp.org/db/conf/iclr/iclr2013w.html#conf/iclr/abs-1301-3781">https://dblp.org/db/conf/iclr/iclr2013w.html#conf/iclr/abs-1301-3781</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azadmaleki</surname><given-names>H</given-names> </name><name name-style="western"><surname>Haghbin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rashidi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>SpeechCARE: harnessing multimodal innovation to transform cognitive impairment detection - insights from the National Institute on Aging Alzheimer&#x2019;s Speech Challenge</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>08</month><day>7</day><volume>329</volume><fpage>1856</fpage><lpage>1857</lpage><pub-id pub-id-type="doi">10.3233/SHTI251249</pub-id><pub-id pub-id-type="medline">40776266</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azadmaleki</surname><given-names>H</given-names> </name><name name-style="western"><surname>Haghbin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rashidi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Momeni Nezhad</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Zolnour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name></person-group><article-title>SpeechCARE: dynamic multimodal modeling for cognitive screening in diverse linguistic and speech task contexts</article-title><source>NPJ Digit Med</source><year>2025</year><month>11</month><day>17</day><volume>8</volume><issue>1</issue><fpage>677</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-02026-x</pub-id><pub-id pub-id-type="medline">41249382</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zolnoor</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Azadmaleki</surname><given-names>H</given-names> </name><name name-style="western"><surname>Haghbin</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>National institute on aging PREPARE challenge: early detection of cognitive impairment using speech: the SpeechCARE solution</article-title><source>arXiv</source><access-date>2026-01-08</access-date><comment>Preprint posted online on  Nov 11, 2025</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2511.08132">https://arxiv.org/pdf/2511.08132</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2511.08132</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rashidi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Azadmaleki</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zolnour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Momeni Nezhad</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name></person-group><article-title>SpeechCura: a novel speech augmentation framework to tackle data scarcity in healthcare</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>08</month><day>7</day><volume>329</volume><fpage>1858</fpage><lpage>1859</lpage><pub-id pub-id-type="doi">10.3233/SHTI251250</pub-id><pub-id pub-id-type="medline">40776267</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brow</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><access-date>2026-02-27</access-date><conf-name>NIPS&#x2019;20: Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.5555/3495724.3495883?download=true">https://dl.acm.org/doi/abs/10.5555/3495724.3495883?download=true</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>P</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Topaz</surname><given-names>M</given-names> </name></person-group><article-title>From conversation to standardized terminology: an LLM-RAG approach for automated health problem identification in home healthcare</article-title><source>J Nurs Scholarsh</source><year>2025</year><month>11</month><volume>57</volume><issue>6</issue><fpage>1003</fpage><lpage>1011</lpage><pub-id pub-id-type="doi">10.1111/jnu.70039</pub-id><pub-id pub-id-type="medline">40785044</pub-id></nlm-citation></ref><ref 
id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hosseini</surname><given-names>SMB</given-names> </name><name name-style="western"><surname>Momeni Nezhad</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Hosseini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name></person-group><article-title>Optimizing entity recognition in psychiatric treatment data with large language models</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>08</month><day>7</day><volume>329</volume><fpage>784</fpage><lpage>788</lpage><pub-id pub-id-type="doi">10.3233/SHTI250947</pub-id><pub-id pub-id-type="medline">40775965</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Nezhad</surname><given-names>MJM</given-names> </name><name name-style="western"><surname>Hosseini</surname><given-names>SMB</given-names> </name><etal/></person-group><article-title>A scoping review of large language model applications in healthcare</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>08</month><day>7</day><volume>329</volume><fpage>1966</fpage><lpage>1967</lpage><pub-id pub-id-type="doi">10.3233/SHTI251302</pub-id><pub-id pub-id-type="medline">40776319</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zolnour</surname><given-names>A</given-names> </name><name name-style="western"><surname>Azadmaleki</surname><given-names>H</given-names> </name><name name-style="western"><surname>Haghbin</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>LLMCARE: early detection of cognitive impairment via transformer models enhanced by LLM-generated synthetic data</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1669896</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1669896</pub-id><pub-id pub-id-type="medline">41280882</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Nezhad</surname><given-names>MJM</given-names> </name><name name-style="western"><surname>Hosseini</surname><given-names>SMB</given-names> </name><etal/></person-group><article-title>Advancing healthcare with large language models: a scoping review of applications and future directions</article-title><source>Int J Med Inform</source><year>2026</year><month>03</month><day>15</day><volume>208</volume><fpage>106231</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2025.106231</pub-id><pub-id pub-id-type="medline">41443123</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Qian</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>M</given-names> </name></person-group><article-title>Depression detection in clinical interviews with LLM-empowered structural element graph</article-title><access-date>2026-02-27</access-date><conf-name>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 16-21, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.naacl-long.452/">https://aclanthology.org/2024.naacl-long.452/</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2024.naacl-long.452</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sabour</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>TMC</given-names> </name></person-group><article-title>Enhanced large language models for effective screening of depression and anxiety</article-title><source>Commun Med (Lond)</source><year>2025</year><month>11</month><day>5</day><volume>5</volume><issue>1</issue><fpage>457</fpage><pub-id pub-id-type="doi">10.1038/s43856-025-01158-1</pub-id><pub-id pub-id-type="medline">41193601</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Koushik</surname><given-names>L</given-names> </name><name name-style="western"><surname>Vishruth</surname><given-names>M</given-names> </name><name name-style="western"><surname>Anand Kumar</surname><given-names>M</given-names> </name></person-group><article-title>Detecting suicide risk patterns using hierarchical attention networks with large language models</article-title><access-date>2026-02-27</access-date><conf-name>Proceedings of the 9th Workshop on Computational Linguistics and Clinical Psychology (CLPsych 2024)</conf-name><conf-date>Mar 21, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.clpsych-1.21/">https://aclanthology.org/2024.clpsych-1.21/</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2024.clpsych-1.21</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A scoping review on generative AI and large language models in mitigating medication related harm</article-title><source>NPJ Digit 
Med</source><year>2025</year><month>03</month><day>28</day><volume>8</volume><issue>1</issue><fpage>182</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01565-7</pub-id><pub-id pub-id-type="medline">40155703</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Pichai</surname><given-names>S</given-names> </name></person-group><article-title>Google AI updates: Bard and new AI features in search</article-title><source>Google Blog</source><year>2023</year><month>02</month><day>6</day><access-date>2025-07-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://blog.google/technology/ai/bard-google-ai-search-updates/">https://blog.google/technology/ai/bard-google-ai-search-updates/</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lanzi</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Saylor</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Fromm</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>MacWhinney</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>ML</given-names> </name></person-group><article-title>DementiaBank: theoretical rationale, protocol, and illustrative analyses</article-title><source>Am J Speech Lang Pathol</source><year>2023</year><month>03</month><day>9</day><volume>32</volume><issue>2</issue><fpage>426</fpage><lpage>438</lpage><pub-id pub-id-type="doi">10.1044/2022_AJSLP-22-00281</pub-id><pub-id pub-id-type="medline">36791255</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The LLaMA 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>Mistralai/ministral-8B-instruct-2410</article-title><source>Hugging Face</source><access-date>2025-07-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/mistralai/Ministral-8B-Instruct-2410">https://huggingface.co/mistralai/Ministral-8B-Instruct-2410</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Papaioannou</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>MedAlpaca: an open-source collection of medical conversational AI models and training data</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 14, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2304.08247</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><collab>DeepSeek-AI</collab><etal/></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 22, 2025</comment><pub-id pub-id-type="doi">10.1038/s41586-025-09422-z</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="report"><article-title>GPT-4o system card</article-title><year>2024</year><month>08</month><day>4</day><access-date>2026-02-28</access-date><publisher-name>OpenAI</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/gpt-4o-system-card.pdf">https://cdn.openai.com/gpt-4o-system-card.pdf</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Pichai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hassabis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kavukcuoglu</surname><given-names>K</given-names> </name></person-group><article-title>Introducing Gemini 2.0: our new AI model for the agentic era</article-title><source>Google Blog</source><year>2024</year><month>12</month><day>11</day><access-date>2025-07-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/">https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Syed</surname><given-names>ZS</given-names> </name><name name-style="western"><surname>Syed</surname><given-names>MSS</given-names> </name><name name-style="western"><surname>Lech</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pirogova</surname><given-names>E</given-names> </name></person-group><article-title>Tackling the ADReSSo challenge 2021: the MUET-RMIT system for Alzheimer&#x2019;s dementia recognition from spontaneous speech</article-title><access-date>2026-02-27</access-date><conf-name>22nd Annual Conference of the International Speech Communication Association &#x2013; INTERSPEECH 2021</conf-name><conf-date>Aug 30 to Sep 3, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2021">https://www.isca-archive.org/interspeech_2021</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2021-1572</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lanzi</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Saylor</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Fromm</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>MacWhinney</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>ML</given-names> </name></person-group><article-title>Establishing the DementiaBank Delaware Corpus: an online multimedia database for the study of language and cognition in dementia</article-title><source>Alzheimers Dement</source><year>2023</year><month>12</month><volume>19</volume><issue>S19</issue><fpage>e073058</fpage><pub-id pub-id-type="doi">10.1002/alz.073058</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="web"><article-title>Amazon Transcribe</article-title><source>Amazon Web Services (AWS)</source><access-date>2025-07-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aws.amazon.com/transcribe">https://aws.amazon.com/transcribe</ext-link></comment></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lian</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name></person-group><article-title>M3-embedding: multi-linguality, multi-functionality, multi-granularity text embeddings through self-knowledge distillation</article-title><access-date>2026-02-27</access-date><conf-name>Findings of the Association for Computational Linguistics: ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.findings-acl.137/">https://aclanthology.org/2024.findings-acl.137/</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.137</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Self-consistency improves chain of thought reasoning in language models</article-title><access-date>2025-07-21</access-date><conf-name>11th International Conference on Learning Representations (ICLR 2023)</conf-name><conf-date>May 1-5, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=1PL1NIMMrw">https://openreview.net/forum?id=1PL1NIMMrw</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Tree of thoughts: deliberate problem solving with large language models</article-title><access-date>2026-02-27</access-date><conf-name>NIPS &#x2019;23: Proceedings of the 37th 
International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 10-16, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.5555/3666122.3666639">https://dl.acm.org/doi/abs/10.5555/3666122.3666639</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dettmers</surname><given-names>T</given-names> </name><name name-style="western"><surname>Holtzman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pagnoni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zettlemoyer</surname><given-names>L</given-names> </name></person-group><article-title>QLoRA: efficient finetuning of quantized LLMs</article-title><access-date>2026-02-27</access-date><conf-name>NIPS &#x2019;23: Proceedings of the 37th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 10-16, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3666122.3666563">https://dl.acm.org/doi/10.5555/3666122.3666563</ext-link></comment><pub-id pub-id-type="doi">10.52202/075280-0441</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="web"><article-title>AutoModels &#x2014; transformers 3.0.2 documentation</article-title><source>Hugging Face</source><access-date>2025-07-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/transformers/v3.0.2/model_doc/auto.html">https://huggingface.co/transformers/v3.0.2/model_doc/auto.html</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Qwen2.5-Omni technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 26, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.20215</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Microsoft</collab><name name-style="western"><surname>Abouelenin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ashfaq</surname><given-names>A</given-names> </name><name name-style="western"><surname>Atkinson</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Phi-4-mini technical report: compact yet powerful multimodal language models via mixture-of-LoRAs</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 3, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.01743</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mann</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Whitney</surname><given-names>DR</given-names> </name></person-group><article-title>On a test of whether one of two random variables is stochastically larger than the other</article-title><source>Ann Math 
Statist</source><year>1947</year><month>03</month><volume>18</volume><issue>1</issue><fpage>50</fpage><lpage>60</lpage><pub-id pub-id-type="doi">10.1214/aoms/1177730491</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="web"><article-title>DementiaBank English protocol Delaware Corpus</article-title><source>TalkBank</source><access-date>2025-07-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://talkbank.org/dementia/access/English/Protocol/Delaware.html">https://talkbank.org/dementia/access/English/Protocol/Delaware.html</ext-link></comment></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mirheidari</surname><given-names>B</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Using the outputs of different automatic speech recognition paradigms for acoustic- and BERT-based Alzheimer&#x2019;s dementia detection through spontaneous speech</article-title><access-date>2026-02-28</access-date><conf-name>22nd Annual Conference of the International Speech Communication Association, Interspeech 2021</conf-name><conf-date>Aug 30 to Sep 3, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2021/pan21c_interspeech.html">https://www.isca-archive.org/interspeech_2021/pan21c_interspeech.html</ext-link></comment><pub-id pub-id-type="doi">10.21437/INTERSPEECH.2021-1519</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Qiao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wiechmann</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kerz</surname><given-names>E</given-names> </name></person-group><article-title>Alzheimer&#x2019;s disease detection from spontaneous speech through combining linguistic complexity and (dis)fluency features with pretrained language models</article-title><access-date>2026-02-28</access-date><conf-name>Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH 2021</conf-name><conf-date>Aug 30 to Sep 3, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2021/qiao21_interspeech.html">https://www.isca-archive.org/interspeech_2021/qiao21_interspeech.html</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2021-1415</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ilias</surname><given-names>L</given-names> </name><name name-style="western"><surname>Askounis</surname><given-names>D</given-names> </name></person-group><article-title>Context-aware attention layers coupled with optimal transport domain adaptation and multimodal fusion methods for recognizing dementia from spontaneous speech</article-title><source>Knowl Based Syst</source><year>2023</year><month>10</month><volume>277</volume><fpage>110834</fpage><pub-id 
pub-id-type="doi">10.1016/j.knosys.2023.110834</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bang</surname><given-names>JU</given-names> </name><name name-style="western"><surname>Han</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>BO</given-names> </name></person-group><article-title>Alzheimer&#x2019;s disease recognition from spontaneous speech using large language models</article-title><source>ETRI Journal</source><year>2024</year><month>02</month><volume>46</volume><issue>1</issue><fpage>96</fpage><lpage>105</lpage><pub-id pub-id-type="doi">10.4218/etrij.2023-0356</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>Modality fusion using auxiliary tasks for dementia detection</article-title><source>Comput Speech Lang</source><year>2026</year><month>01</month><volume>95</volume><fpage>101814</fpage><pub-id pub-id-type="doi">10.1016/j.csl.2025.101814</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="web"><article-title>FDA clears first blood test used in diagnosing Alzheimer&#x2019;s disease</article-title><source>US Food and Drug Administration</source><year>2025</year><month>05</month><day>16</day><access-date>2025-06-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/news-events/press-announcements/fda-clears-first-blood-test-used-diagnosing-alzheimers-disease">https://www.fda.gov/news-events/press-announcements/fda-clears-first-blood-test-used-diagnosing-alzheimers-disease</ext-link></comment></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zolnoori</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vergez</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kostic</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Audio recording patient-nurse verbal communications in home health care settings: pilot feasibility and usability study</article-title><source>JMIR Hum Factors</source><year>2022</year><month>05</month><day>11</day><volume>9</volume><issue>2</issue><fpage>e35325</fpage><pub-id pub-id-type="doi">10.2196/35325</pub-id><pub-id pub-id-type="medline">35544296</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>In-context learning with demonstration selection prompt design.</p><media xlink:href="ai_v5i1e82608_app1.docx" xlink:title="DOCX File, 119 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Reasoning-based methods prompt design.</p><media xlink:href="ai_v5i1e82608_app2.docx" xlink:title="DOCX File, 202 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Tree-of-thought reasoning 
prompt design.</p><media xlink:href="ai_v5i1e82608_app3.docx" xlink:title="DOCX File, 434 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Fine-tuning prompt design.</p><media xlink:href="ai_v5i1e82608_app4.docx" xlink:title="DOCX File, 117 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Fine-tuning details and hyperparameters.</p><media xlink:href="ai_v5i1e82608_app5.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Token probability-based classification details.</p><media xlink:href="ai_v5i1e82608_app6.docx" xlink:title="DOCX File, 13 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Multimodal large language models fine-tuning prompt.</p><media xlink:href="ai_v5i1e82608_app7.docx" xlink:title="DOCX File, 58 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Multimodal large language models fine-tuning hyperparameters.</p><media xlink:href="ai_v5i1e82608_app8.docx" xlink:title="DOCX File, 47 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Definitions of linguistic measures.</p><media xlink:href="ai_v5i1e82608_app9.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>