<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="review-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e80123</article-id><article-id pub-id-type="doi">10.2196/80123</article-id><article-categories><subj-group subj-group-type="heading"><subject>Review</subject></subj-group></article-categories><title-group><article-title>Large Language Model&#x2013;Based Agents for Physical Activity and Cognitive Training: Scoping Review</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Silacci</surname><given-names>Alessandro</given-names></name><degrees>BSc, MSc, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Giachetti</surname><given-names>Benedetta</given-names></name><degrees>BSc, MSc</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Angelini</surname><given-names>Leonardo</given-names></name><degrees>BSc, MSc, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Lopomo</surname><given-names>Nicola Francesco</given-names></name><degrees>MSc, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Andreoni</surname><given-names>Giuseppe</given-names></name><degrees>MSc, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mugellini</surname><given-names>Elena</given-names></name><degrees>BSc, MSc, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cherubini</surname><given-names>Mauro</given-names></name><degrees>BA, MA, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Caon</surname><given-names>Maurizio</given-names></name><degrees>BSc, MSc, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Information Systems, Faculty of Business and Economics, University of Lausanne</institution><addr-line>Quartier Centre</addr-line><addr-line>Lausanne</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>Digital Business Center, School of Management Fribourg, HES-SO University of Applied Sciences and Arts Western Switzerland</institution><addr-line>Fribourg</addr-line><country>Switzerland</country></aff><aff id="aff3"><institution>HumanTech Institute, School of Engineering and Architecture Fribourg, HES-SO University of Applied Sciences and Arts Western Switzerland</institution><addr-line>Fribourg</addr-line><country>Switzerland</country></aff><aff id="aff4"><institution>Department of Informatics, Faculty of Science and Medicine, University of Fribourg</institution><addr-line>Fribourg</addr-line><country>Switzerland</country></aff><aff 
id="aff5"><institution>Design Department, Politecnico di Milano</institution><addr-line>Milan</addr-line><country>Italy</country></aff><aff id="aff6"><institution>Bioengineering Laboratory, Scientific Institute IRCCS E. Medea</institution><addr-line>Bosisio Parini</addr-line><country>Italy</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Emam</surname><given-names>Khaled El</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Sezgin</surname><given-names>Emre</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Santos</surname><given-names>Jos&#x00E9;</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Alessandro Silacci, BSc, MSc, PhD, Department of Information Systems, Faculty of Business and Economics, University of Lausanne, Quartier Centre, Lausanne, 1015, Switzerland, 41 21 692 11 11; <email>alessandro.silacci@unil.ch</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>12</day><month>3</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e80123</elocation-id><history><date date-type="received"><day>04</day><month>07</month><year>2025</year></date><date date-type="rev-recd"><day>15</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>18</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Alessandro Silacci, Benedetta Giachetti, Leonardo Angelini, Nicola Francesco Lopomo, Giuseppe Andreoni, Elena Mugellini, Mauro Cherubini, Maurizio Caon. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 12.3.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e80123"/><abstract><sec><title>Background</title><p>Large language model (LLM)&#x2013;based conversational agents have been increasingly used in digital health interventions. However, their specific application to physical activity (PA) and cognitive training&#x2014;two critical well-being domains&#x2014;has not been systematically mapped. 
In fact, these domains share an important need for personalized, adaptive support and conversational engagement, making them relevant targets for examining how LLM-based agents are currently conceptualized and deployed.</p></sec><sec><title>Objective</title><p>This scoping review aimed to map the extent, characteristics, and design practices of LLM-based conversational agents supporting PA or cognitive training, specifically analyzing their application contexts, social roles, and technological features.</p></sec><sec sec-type="methods"><title>Methods</title><p>Following PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews) guidelines, we searched Web of Science, Scopus, PubMed, ACM Digital Library, and IEEE Xplore for studies published between January 2018 and December 2024. We included eligible studies that described LLM-based conversational agents designed for PA or cognitive training. Two reviewers independently screened records and extracted data. Descriptive synthesis and framework analysis were used to characterize intervention domains, agent roles, prompting strategies, model types, and reported outcomes.</p></sec><sec sec-type="results"><title>Results</title><p>Of 357 records screened, 10 studies met eligibility criteria (7 on PA and 3 on cognitive training). Applications predominantly involved coaching roles for PA and companion or scaffolding roles in cognitive domains. The agent landscape was dominated by proprietary LLMs (GPT-3.5, GPT-4, and Bard), with limited use of open-weight models. Prompt engineering emerged as a central yet inconsistently documented design mechanism. 
Reported outcomes mainly focused on perceived usefulness, engagement, or content quality, with few quantitative behavioral outcomes.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLM-based conversational agents have demonstrated early promise for supporting PA and emerging approaches to cognitive training, yet the current evidence remains exploratory and methodologically limited. Key challenges persist, including inconsistent reporting of prompts, reliance on proprietary models with limited reproducibility, and a lack of standardized outcome measures. More rigorous and transparently documented evaluations of these tools are required to strengthen the evidence base and guide future development.</p></sec></abstract><kwd-group><kwd>cognitive training</kwd><kwd>conversational agents</kwd><kwd>large language models</kwd><kwd>physical activity</kwd><kwd>prompt engineering</kwd><kwd>reproducibility</kwd><kwd>scoping review</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Conversational agents (CAs) have been increasingly integrated into digital health interventions, offering scalable and personalized support for health-related behavioral change [<xref ref-type="bibr" rid="ref1">1</xref>]. Within intervention domains such as physical activity (PA) promotion and cognitive health, CAs have reported promising results in fostering user engagement, supporting self-regulation, and enhancing adherence. These agents often mimic human dialogue to educate [<xref ref-type="bibr" rid="ref2">2</xref>], prompt reflection [<xref ref-type="bibr" rid="ref3">3</xref>], or guide users toward their behavior goals [<xref ref-type="bibr" rid="ref4">4</xref>]. 
However, earlier conversational systems&#x2014;particularly those based on prescripted or narrowly scoped interaction models&#x2014;often failed to support the flexible, conversational interaction users expected, instead requiring command-like and highly constrained input that many users experienced as frustrating or unnatural [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>The recent emergence of large language models (LLMs), such as OpenAI&#x2019;s ChatGPT, introduces a fundamental shift in how CAs can be developed and deployed [<xref ref-type="bibr" rid="ref6">6</xref>]. In fact, LLMs offer open-ended dialogue capabilities, context-sensitive responses, and general-purpose reasoning [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]&#x2014;qualities that may greatly enhance the reach and efficacy of behavioral interventions. Early research suggests that LLM-powered agents can emulate counseling techniques, adapt their tone and content dynamically, and even facilitate therapeutic alliance-like interactions [<xref ref-type="bibr" rid="ref10">10</xref>]. These affordances align with long-standing human-computer interaction (HCI) priorities, particularly the call for emotionally intelligent, adaptive, and personalized systems to support human well-being [<xref ref-type="bibr" rid="ref11">11</xref>]. As such, LLM-based CAs are especially relevant in health domains where motivation, personalization, and sustained engagement are critical.</p><p>Despite the promise of LLMs, there is currently a lack of systematic understanding of how LLM-based CAs are being used, or could be used, in the context of PA and cognitive interventions. 
Existing literature reviews have examined artificial intelligence (AI)&#x2013;powered CAs across domains, such as PA [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], obesity treatment [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>], and mental health [<xref ref-type="bibr" rid="ref18">18</xref>]. These studies highlighted several recurring limitations, including conversational rigidity, shallow personalization, limited contextual awareness, and repetitive or unnatural dialogue patterns [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Such shortcomings can hinder user engagement and learning effectiveness, particularly in interventions that rely on sustained motivation and adaptive feedback, as is the case for both PA coaching and cognitive training systems [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. These weaknesses highlight a critical design gap that LLMs are poised to address, thanks to their capacity for open-ended interaction, adaptive tone, and flexible role construction. However, current reviews do not account for the distinctive capabilities, challenges, and design considerations introduced by this new class of models. As LLMs continue to gain traction in both academic and commercial health technologies, a timely synthesis is needed to chart the emerging landscape, identify key design patterns, and highlight open research questions.</p><p>Despite being distinct domains, PA and cognitive training are frequently integrated and combined in the literature, particularly within interactive and digitally mediated interventions [<xref ref-type="bibr" rid="ref23">23</xref>]. Indeed, PA produces well-established cognitive and neurobiological effects [<xref ref-type="bibr" rid="ref24">24</xref>]. 
Moreover, combined physical-cognitive training has been shown to yield synergistic benefits compared to single-domain approaches, although it remains subject to common challenges related to engagement, adherence, and intervention design. Recent interactive and exergame-based systems further demonstrate that physical and cognitive components are deeply intertwined in user interaction and system design, rather than implemented as isolated modalities [<xref ref-type="bibr" rid="ref25">25</xref>]. Accordingly, this review considers both domains to examine how LLM-based CAs are designed and evaluated in well-being interventions. The focus is on their shared interactional, motivational, and personalization mechanisms, rather than on comparing clinical outcomes across domains.</p><p>This review addresses that gap by systematically mapping how LLM-based CAs are conceptualized, applied, and evaluated in interventions addressing PA and cognitive training. Our contribution is 2-fold. First, we characterize the state of the art in this fast-moving field, including system features, use contexts, and intended outcomes. Second, and more critically, we move beyond summary to dissect <italic>how</italic> these systems are built and the scientific challenges this creates. We specifically analyze the practice of prompt engineering as an informal yet central design mechanism. Furthermore, we highlight how the prevalent use of proprietary &#x201C;black box&#x201D; models and inconsistent documentation pose a fundamental threat to reproducibility, hindering the field&#x2019;s cumulative scientific progress. 
By surfacing these methodological risks, we provide a necessary critical perspective that complements and advances prior work on digital health agents.</p></sec><sec id="s1-2"><title>Objectives</title><p>This scoping review explores the role of LLM-based CAs in supporting individuals&#x2019; PA and cognitive training, providing a comprehensive overview and critical evaluation of their impact. Specifically, it examines how these AI-driven agents facilitate engagement, personalize interactions, and address challenges in interventions aimed at enhancing both physical and cognitive well-being.</p><p>To structure this analysis, we investigated 3 key research questions (RQs), focusing on their applications, social dynamics, and integration with complementary technologies:</p><list list-type="bullet"><list-item><p>RQ1. In what ways have LLM-based CAs been applied to support well-being, particularly in the contexts of PA and cognitive training?</p></list-item><list-item><p>RQ2. How does existing literature characterize the social roles of LLM-based CAs in PA and cognitive training interventions?</p></list-item><list-item><p>RQ3. 
What additional technologies or design features are integrated with LLM-based CAs to enhance their effectiveness in PA and cognitive training interventions?</p></list-item></list></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This scoping review analyzes the landscape of LLM-based CAs for PA and cognitive training, following the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews) guidelines (<xref ref-type="supplementary-material" rid="app2">Checklist 1</xref>) [<xref ref-type="bibr" rid="ref26">26</xref>] to enhance the transparency and completeness of reporting.</p></sec><sec id="s2-2"><title>Search Strategy</title><p>To identify relevant studies, keywords related to CAs, PA, cognition, and well-being were derived from a preliminary literature overview [<xref ref-type="bibr" rid="ref27">27</xref>]. In parallel, LLM-specific keywords were selected based on prior LLM reviews and the authors&#x2019; domain expertise. Searches were primarily performed using combinations of these keyword groups (K1, K2, K3, ... Kn) across Clarivate Web of Science (WoS) and Elsevier Scopus, 2 databases recognized for their comprehensive coverage of peer-reviewed academic research [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. To ensure comprehensive coverage, we further included PubMed, ACM Digital Library, and IEEE Xplore in our search based on their frequent use in prior literature reviews [<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec><sec id="s2-3"><title>Keywords and Queries</title><p>Keywords were collected from the search queries of other literature reviews involving agents used in PA and cognitive training. 
Queries included papers from January 2018 to December 2024, as 2018 marks the introduction of Bidirectional Encoder Representations from Transformers, the first language model to enable bidirectional language understanding, a foundational feature for LLMs [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>For full transparency and reproducibility, the complete and exact search query strings used for each database are openly accessible in a findable, accessible, interoperable, reusable (FAIR)&#x2013;compliant repository [<xref ref-type="bibr" rid="ref27">27</xref>]. As an example of our search structure, we used two main queries: one combining keywords from group K1 (agents), group K2 (technology), and group K3 (physical activity); and a second combining keywords from group K1 (agents), group K2 (technology), and group K4 (cognition), all using Boolean operators. A simplified representation of the search logic for these two distinct queries was as follows:</p><list list-type="bullet"><list-item><p>Query 1: <italic>(K1 keywords) AND (K2 keywords) AND (K3 keywords)</italic></p></list-item><list-item><p>Query 2: <italic>(K1 keywords) AND (K2 keywords) AND (K4 keywords)</italic></p></list-item></list><p>This structure was adapted to the specific syntax requirements of WoS and Elsevier Scopus.</p></sec><sec id="s2-4"><title>Study Selection</title><p>Inclusion criteria were established and adapted from previous research on CAs for well-being [<xref ref-type="bibr" rid="ref31">31</xref>]. Results from query 1 and query 2 were uploaded to the Covidence [<xref ref-type="bibr" rid="ref32">32</xref>] platform, where all authors were invited to participate in the review process.</p><p>The first selection phase assessed studies based on their title, abstract, and keywords, applying all inclusion and exclusion criteria except full-text availability and, in some instances, language. 
Following an initial pilot screening round (14 papers of which 5 were selected and 9 were irrelevant, ~36% eligible, details available as supplementary materials [<xref ref-type="bibr" rid="ref27">27</xref>]), the inclusion and exclusion criteria were refined to improve clarity and consistency among reviewers. During the full-text screening phase, the complete set of eligibility criteria was applied (cf, Supplementary Material [<xref ref-type="bibr" rid="ref27">27</xref>]). In both phases, each study was independently evaluated by at least two reviewers. In cases of disagreement, a third reviewer was consulted, and discussions were held when necessary to reach consensus.</p></sec><sec id="s2-5"><title>Data Extraction</title><p>Key study characteristics were extracted, including bibliographic details (title, authors, year, outlet or conference), study type, and specific information outlined in the data collection protocol available through our FAIR repository [<xref ref-type="bibr" rid="ref27">27</xref>]. These included the study aim, LLM model and access modality, fine-tuning approach, characteristics of the CAs (name, form, role, purpose, and design), deployment context, interactional structure, software used, type of physical or cognitive activity, and prompt features. All data were independently reviewed by 2 authors. Discrepancies were resolved through discussion to ensure consistency and accuracy.</p></sec><sec id="s2-6"><title>Data Synthesis</title><p>To synthesize the collected data, we used both descriptive and qualitative approaches. Descriptive statistics were used to summarize key study characteristics, including publication year, country of study, study type and design, interaction modalities, and targeted domains (eg, PA and cognitive training). 
These metrics provided insights into the distribution and focus areas of existing research involving LLM-based CAs in the context of PA and cognitive training.</p><p>We used a framework analysis to synthesize findings from the included studies [<xref ref-type="bibr" rid="ref33">33</xref>]. Two authors systematically extracted and charted textual data related to the design, use, and evaluation of the interventions. This charting focused on key domains, including the specific prompts used to guide the CAs and the types of outcomes measured (both qualitative and quantitative). This systematic approach allowed us to identify recurring patterns and emerging themes within the evidence base.</p><p>All conflicts or uncertainties in data interpretation were resolved collaboratively between at least two reviewers.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>To ensure the utmost transparency and facilitate reproducibility, key research artifacts, specifically the full database search queries, the raw and consolidated data extraction datasets, and the R Markdown notebook used for quantitative analysis, as well as our framework analysis results have been publicly archived, adhering to the FAIR principles and recommendations provided by Niksirat et al [<xref ref-type="bibr" rid="ref34">34</xref>]. The same supplementary material content is also available in the <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Search and Screening Results</title><p>The initial database search yielded 833 records, with 386 (46.34%) from Scopus, 312 (37.45%) from Web of Science, 105 (12.6%) from PubMed, 22 (2.64%) from IEEE Xplore, and 8 (0.96%) from ACM DL. After removing 476 (57.14%) duplicates using Covidence, 357 (42.86%) records remained for title and abstract screening. Of these, 330 (92.44%) were excluded based on the inclusion criteria. 
The remaining 27 (7.56%) full-text articles were assessed for eligibility, resulting in the exclusion of 17 (62.96%) studies due to reasons such as being out of scope (n=9, 52.94%), wrong study design (n=3, 17.65%), or irrelevant outcomes, interventions, or indications. Ultimately, 10 (37.04%) studies met all eligibility criteria and were included in the final review (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The dataset resulting from our analysis is available as supplementary material through our FAIR repository [<xref ref-type="bibr" rid="ref27">27</xref>].</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flowchart, provided by Covidence, with the search and the selection process of the studies included in our review. ACM: Association for Computing Machinery; IEEE: Institute of Electrical and Electronics Engineers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e80123_fig01.png"/></fig></sec><sec id="s3-2"><title>Domains of Use and Contextual Focus</title><p>The 10 included studies explored a range of well-being applications, with a strong emphasis on PA (n=7; [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]), and fewer focusing on cognitive training (n=3; [<xref ref-type="bibr" rid="ref42">42</xref>-<xref ref-type="bibr" rid="ref44">44</xref>]). These interventions aimed to support behaviors such as exercise adherence, task planning, reminiscence, and memory stimulation. 
Most were delivered via web platforms (n=7; [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]), followed by mobile apps (n=2; [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]), messaging platforms (n=1; [<xref ref-type="bibr" rid="ref41">41</xref>]), and robotic embodiments (n=1; [<xref ref-type="bibr" rid="ref44">44</xref>]).</p><p>Several studies designed agents that adapted their responses to user input, preferences, or contexts&#x2014;such as tailoring memory prompts, adjusting training plans, or offering reminders. In cognitive domains, agents facilitated engagement by drawing on familiar content (eg, book discussions, life stories), whereas in PA, personalization was often linked to goal tracking or motivation cues.</p><p>Most studies followed an empirical (n=7; [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]) or design-oriented (n=3; [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]) methodology, reflecting a pragmatic emphasis on system development and usability testing over theoretical modeling. However, only a few studies offered robust outcome measures.</p><p>These findings comprehensively address RQ1 by illustrating the primary domains (PA and cognitive training) and specific intervention goals (eg, exercise adherence, memory stimulation) where LLM-based CAs are currently being applied.</p></sec><sec id="s3-3"><title>Social Roles and Interaction Patterns</title><p>LLM-based CAs were designed with varied social roles, which ranged in both functionality and relational framing. 
The most common role was that of a personal coach [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref41">41</xref>], emphasizing directive and motivational engagement. Other roles included companions [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>], assistants [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref44">44</xref>], counselors [<xref ref-type="bibr" rid="ref38">38</xref>], experts [<xref ref-type="bibr" rid="ref42">42</xref>], and recommender systems [<xref ref-type="bibr" rid="ref39">39</xref>] (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>The distribution of the roles that were attributed to the large language model&#x2013;based conversational agents in the reviewed studies, demonstrating that these conversational agents were predominantly being used as coaches [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref41">41</xref>] but also involved as companions [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>] and other more specific roles such as assistants [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref44">44</xref>], counselor [<xref ref-type="bibr" rid="ref38">38</xref>], expert [<xref ref-type="bibr" rid="ref42">42</xref>], and recommender systems [<xref ref-type="bibr" rid="ref39">39</xref>].</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Role</td><td align="left" valign="bottom">Reviewed articles</td></tr></thead><tbody><tr><td align="left" valign="top">Personal coach</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" 
rid="ref41">41</xref>]</td></tr><tr><td align="left" valign="top">Companion</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top">Scaffolding expert</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref42">42</xref>]</td></tr><tr><td align="left" valign="top">Recommender system</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref39">39</xref>]</td></tr><tr><td align="left" valign="top">Patient counselor</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref38">38</xref>]</td></tr><tr><td align="left" valign="top">Digital assistant</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top">Medical assistant</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr></tbody></table></table-wrap><p>Despite this diversity, dialogue patterns remained limited: most systems were user initiated (n=5; [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]), with only [<xref ref-type="bibr" rid="ref44">44</xref>] and [<xref ref-type="bibr" rid="ref40">40</xref>] supporting shared initiative. Communication was primarily text-based [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], although 4 studies implemented voice interaction [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref42">42</xref>-<xref ref-type="bibr" rid="ref44">44</xref>].</p><p>Another consistent pattern across all studies was the use of dyadic interaction models; indeed, the CA was always designed to interact with a single user at a time. 
No studies explored group dynamics, multiuser interfaces, or collaborative scenarios involving multiple stakeholders. This highlights a current boundary in the design space, where LLM-based CAs are predominantly framed as personalized, one-to-one agents rather than social actors embedded in broader systems or communities.</p></sec><sec id="s3-4"><title>Technological Design and Functional Integration</title><p>Technological integration centered around popular LLMs (<xref ref-type="table" rid="table2">Table 2</xref>), such as GPT-4.0 (n=5; [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]) and GPT-3.5 (n=4; [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]; GPT-3.5-turbo in [<xref ref-type="bibr" rid="ref42">42</xref>]), with occasional use of alternatives such as Google Bard [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref40">40</xref>], Mistral [<xref ref-type="bibr" rid="ref35">35</xref>], and Llama [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. These models were most commonly accessed via web-based interfaces, with several studies directly using web clients such as ChatGPT&#x2019;s web platform [<xref ref-type="bibr" rid="ref45">45</xref>], whereas others integrated the models into mobile applications. 
A smaller subset of studies incorporated multimodal features, such as voice interaction, embodied agents (eg, the EVA robot [<xref ref-type="bibr" rid="ref46">46</xref>]), or context-aware elements such as memory recall cues, directly detailing the additional technologies and design features specified in RQ3.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>A reporting of the different large language models used in the reviewed studies with a main use of GPT-4.0 (n=5; [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]), GPT-3.5 (n=4; [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]), and Google Bard [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref40">40</xref>] for the closed-source models, in addition to the mention of open-weight models, such as Mistral [<xref ref-type="bibr" rid="ref43">43</xref>] and Llama [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref43">43</xref>].</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Developer, model, and variant</td><td align="left" valign="bottom">Reviewed articles</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">OpenAI</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ChatGPT</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-3.5</td><td align="left" valign="top">[<xref ref-type="bibr" 
rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-3.5-turbo</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref42">42</xref>]</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4.0</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NA</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Whisper</td><td align="char" char="." 
valign="top">[<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top" colspan="2">Google</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Bard</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NA</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]</td></tr><tr><td align="left" valign="top" colspan="2">Meta</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref35">35</xref>]</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2-13B</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref43">43</xref>]</td></tr><tr><td align="left" valign="top" colspan="2">Mistral.ai</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mistral</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>7B</td><td align="left" valign="top">[<xref ref-type="bibr" 
rid="ref43">43</xref>]</td></tr></tbody></table></table-wrap><p>A cross-cutting category was related to the use of engagement and motivation strategies, including adaptive scaffolding, goal reminders, or user-personalized prompts. While some studies integrated these intentionally, others relied on the natural language capabilities of LLMs to simulate engagement (eg, responding conversationally or using humor). Only a subset of studies reported evaluating these engagement-related elements in a structured way, while others did not include specific assessments of these components.</p><p>Finally, many studies highlighted technical or operational challenges. These included LLM response delays, platform limitations, and concerns over content appropriateness or factuality. Several studies used semiautomated systems due to constraints in LLM access or stability. Moreover, the lack of standardized outcome measures was a consistent limitation: although most studies reported user satisfaction or feasibility, few provided behavioral or cognitive metrics tied to intervention efficacy.</p></sec><sec id="s3-5"><title>Prompting as Design: Framing Roles, Personalization, and Interaction</title><p>In addition to model selection and platform deployment, the reviewed studies revealed a critical design layer in the form of prompt engineering&#x2014;that is, the crafting of textual inputs that guide the behavior, tone, and identity of LLM-based CAs. Although rarely emphasized as a formal methodology, prompt design emerged as a powerful mechanism through which researchers shaped the agent&#x2019;s functional logic, social role, and interpersonal dynamics. 
The resulting framework analysis of the prompt strategies observed across the included studies highlights 4 key patterns in how prompts were used to construct agent behavior and interaction style (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>A summary of the different categories that emerged when analyzing the provided prompts in the reviewed studies (7 papers provided prompts out of the 10 analyzed).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Studies</td></tr></thead><tbody><tr><td align="left" valign="top">Instructional and informational requests</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top">Role-based identity assignment</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref42">42</xref>-<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top">Scenario-based personalization</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref42">42</xref>-<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top">Task-oriented dialogue support</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]</td></tr></tbody></table></table-wrap><p>Across the studies, prompts were not generic queries but scripted scenarios or instructions that encoded interactional intent. In many cases, this took the form of scenario-based personalization. 
For instance, Favela et al [<xref ref-type="bibr" rid="ref44">44</xref>] framed interactions within dementia care routines, embedding the agent in emotionally significant and context-sensitive dialogue.</p><p>Prompting also played a decisive role in constructing the social identity of the agent. Multiple studies used explicit role-based instructions, instructing the LLM: &#x201C;You are a book club host,&#x201D; &#x201C;You are a healthcare assistant,&#x201D; or &#x201C;You are a caregiver.&#x201D; These role assignments acted as social anchors, shaping how the CA would behave&#x2014;more directive as a coach [<xref ref-type="bibr" rid="ref36">36</xref>], more empathetic as a companion [<xref ref-type="bibr" rid="ref44">44</xref>], or more informative as a recommender system [<xref ref-type="bibr" rid="ref39">39</xref>]. These framing devices reflect a broader reliance on prompting to simulate relational presence, particularly in the absence of embodied or affective sensing.</p><p>In terms of functionality, prompts often scaffolded task-oriented dialogues. For example, Hu et al [<xref ref-type="bibr" rid="ref42">42</xref>] embedded conditional logic into their prompt to simulate a grocery shopping task for cognitive assessment. Rather than scripting full interaction flows, these designs leveraged the LLM&#x2019;s interpretive flexibility, using natural language to create interactive, multistep tasks without formal programming. Prompting thus served as a low-code interface for designing agent behavior.</p><p>Some prompts explicitly addressed affective tone and ethical conduct. 
Favela et al [<xref ref-type="bibr" rid="ref44">44</xref>] instructed the CA not to &#x201C;talk like a child&#x201D; when interacting with older adults and to remain &#x201C;patient and respectful.&#x201D; Such affect-aware framing suggests a growing awareness that prompts are not only functional but also relational instruments, capable of shaping the user&#x2019;s emotional experience.</p><p>Importantly, all studies used LLMs in dyadic contexts, with prompts structured for one-on-one interaction. No study designed prompts to support group conversation, multiuser turn-taking, or collective memory tasks. This reflects a current boundary in the field: despite LLMs&#x2019; flexible dialogue capabilities, they are still being operationalized primarily as personalized single-user agents.</p><p>Of the 10 reviewed studies, 3 did not provide the prompts in the text or the supplementary materials. One additional study [<xref ref-type="bibr" rid="ref35">35</xref>] did not use prompting to shape the agent&#x2019;s behavior as a CA but instead framed the task as a reasoning exercise: the LLM was given a user profile and asked to infer likely future decisions from that perspective.</p><p>Taken together, the analysis of prompt strategies highlights that prompting is not just a technical necessity but a central design practice. Whether used to construct social roles, personalize content, or manage conversation flow, prompts serve as the invisible scaffolding behind agent behavior. Yet, few studies evaluated or iterated on prompt effectiveness, suggesting a need for more systematic approaches to prompt design and testing in future work.</p><p>In relation to the research questions, these prompt strategies were used to construct social roles (RQ2) and to personalize content or manage conversation flow 
(both enhancing effectiveness, RQ3).</p></sec><sec id="s3-6"><title>Reported Outcomes: Perception, Behaviors, and Evaluation Gaps</title><p>The included studies reported a range of qualitative and quantitative outcomes, offering insights into both user experience and the effectiveness of LLM-based CAs. A framework analysis of these outcomes revealed five dominant categories: perceived usefulness, user engagement, content quality, behavioral impact, and a notable lack of quantitative evaluation. These categories illustrate how CAs are currently being evaluated and highlight where the evidence remains limited or anecdotal (<xref ref-type="table" rid="table4">Table 4</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>A summary of the different categories that emerged from the outcomes&#x2019; analysis in the reviewed papers.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Studies</td></tr></thead><tbody><tr><td align="left" valign="top">Behavioral impact</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]</td></tr><tr><td align="left" valign="top">Content quality</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]</td></tr><tr><td align="left" valign="top">Perceived usefulness</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]</td></tr><tr><td align="left" valign="top">User engagement</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" 
rid="ref44">44</xref>]</td></tr><tr><td align="left" valign="top">Lack of quantitative evaluation</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]</td></tr></tbody></table></table-wrap><p>Perceived usefulness was a recurring category in the reviewed papers. For instance, participants in the study by Hu et al [<xref ref-type="bibr" rid="ref42">42</xref>] described the agent as helpful and easy to use, indicating a positive perception of its utility in their tasks.</p><p>Similarly, user engagement was frequently reported as a key outcome. Several studies used interaction metrics as a proxy for engagement; Favela et al [<xref ref-type="bibr" rid="ref44">44</xref>] noted that users sustained conversations lasting for over 10 minutes, suggesting a naturalistic and engaging interaction. Hu et al [<xref ref-type="bibr" rid="ref42">42</xref>] documented consistent use of specific features, such as recommendations, while Sun et al [<xref ref-type="bibr" rid="ref41">41</xref>] found that exposure to the agent was a significant predictor of engagement. The latter study also identified specific design elements, such as humor, as potential enhancers of engagement, although their direct motivational impact was not conclusively determined.</p><p>The content quality of the agent&#x2019;s output emerged as a dual-focused category, with studies reporting both strengths and significant weaknesses. On the positive side, Washif et al [<xref ref-type="bibr" rid="ref36">36</xref>] found that an agent&#x2019;s exercise recommendations were consistent with standard plans. Bak and Chin [<xref ref-type="bibr" rid="ref35">35</xref>] reported that LLMs could generate more stage-appropriate health information when provided with user profiles that included clear goals. 
However, the same study noted limitations in the recommendations for users in certain stages of behavior change. Some studies reported several limitations related to the accuracy, reliability, or personalization of the agents&#x2019; responses. Pugliese et al [<xref ref-type="bibr" rid="ref38">38</xref>], for example, found that while responses were understandable, not all information provided was reliable or sufficiently personalized.</p><p>A smaller subset of studies attempted to measure behavioral impact, although often with preliminary or indirect indicators. Bak and Chin [<xref ref-type="bibr" rid="ref35">35</xref>] evaluated the potential for LLMs to address different stages of the transtheoretical model, highlighting that the models tended to favor certain strategies over others without providing clear reasons. While not finding significant changes in motivation, Sun et al [<xref ref-type="bibr" rid="ref41">41</xref>] did demonstrate that the inclusion of humor had a measurable effect on participants&#x2019; PA, pointing to a potential, albeit subtle, behavioral influence.</p><p>Finally, a critical finding was the widespread lack of quantitative evaluation. While a few studies reported quantitative metrics such as conversation time [<xref ref-type="bibr" rid="ref44">44</xref>] or feature usage [<xref ref-type="bibr" rid="ref42">42</xref>], these were the exception. Many papers explicitly acknowledged that their evaluation was exploratory or provided no quantitative data at all [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref43">43</xref>].</p><p>Overall, the limited application of standardized outcome measures suggests that the current evidence base remains preliminary. 
While studies frequently reported positive user perceptions, few included validated behavioral or cognitive assessments to substantiate claims of effectiveness. This constrains the ability to draw firm conclusions regarding both the impact of these systems (RQ1) and the design choices that shape their performance (RQ3).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This scoping review examined how LLM-based CAs have been applied to support well-being, with a focus on PA and cognitive training. Through framework analysis, prompt categorization, and evaluation of reported outcomes, we identified patterns in how these systems are designed, deployed, and assessed. In the following sections, we interpret the findings across three key areas: application contexts, role construction, and design and evaluation strategies.</p><p>First, in terms of application contexts, our results show that LLM-based CAs are primarily deployed in interventions related to PA, where they function as digital coaches offering motivational prompts, personalized planning, and behavioral reinforcement [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. These roles align with traditional coaching models that emphasize goal setting, encouragement, and self-monitoring [<xref ref-type="bibr" rid="ref47">47</xref>]. In contrast, cognitive training applications remain limited and exploratory. 
The 2 studies in this domain focused on reminiscence [<xref ref-type="bibr" rid="ref44">44</xref>] and task scaffolding [<xref ref-type="bibr" rid="ref42">42</xref>] rather than delivering structured cognitive exercises grounded in validated protocols.</p><p>This pattern suggests that PA interventions may present a more immediate design fit for LLM-based agents, particularly because motivational dialogue can be framed using general-purpose language generation without requiring deep domain modeling, thereby furthering our understanding of RQ1. Conversely, cognitive interventions&#x2014;especially those targeting impairments&#x2014;require higher precision, domain knowledge, and ethical sensitivity, which current prompt-based implementations may struggle to provide. Moreover, most applications in both domains rely on static personalization, often configured during initial sessions, rather than dynamically adapting to user behavior or outcomes over time.</p><p>These findings highlight an underexplored opportunity to use LLM-based agents for more structured cognitive support. Future work should explore how these systems can be embedded in adaptive frameworks that respond to longitudinal user behavior, particularly in contexts such as memory training or executive function support. Additionally, the absence of evidence-based cognitive training protocols underscores the need for interdisciplinary collaboration between HCI, cognitive neuroscience, and clinical design.</p><p>Second, in terms of role construction, our review shows that LLM-based CAs predominantly rely on prompt-engineered instructions to instantiate social roles such as &#x201C;coach,&#x201D; &#x201C;companion,&#x201D; or &#x201C;caregiver.&#x201D; This design choice enables rapid prototyping; however, it also exposes a deeper structural limitation. Indeed, LLMs do not maintain stable personas across extended interactions [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. 
Recent empirical work demonstrates that persona adherence degrades during multiturn dialogue, with models gradually drifting away from assigned psychological profiles or communicative styles. Bhandari et al [<xref ref-type="bibr" rid="ref49">49</xref>] show that LLMs frequently lose alignment with Big Five trait configurations over the course of dyadic conversations, even when such traits are explicitly embedded at initialization, highlighting inconsistencies in sustained personality expression. Similar instability has been documented in emotional support settings, where personas influence strategy use but undergo measurable shifts in emotionality and extraversion as conversations unfold [<xref ref-type="bibr" rid="ref50">50</xref>]. Once a user diverges from the scripted interaction, the role coherence can collapse, exposing the user to potential agent hallucinations [<xref ref-type="bibr" rid="ref51">51</xref>]. This observation is consistent with findings in the LLM literature showing that persona conditioning degrades over time without architectural support (eg, memory, state tracking) [<xref ref-type="bibr" rid="ref52">52</xref>].</p><p>This fragility aligns with theoretical perspectives on role-play in LLMs, which conceptualize dialogue agents as enacting simulacra that lack internal persistence. Shanahan et al [<xref ref-type="bibr" rid="ref53">53</xref>] argue that LLMs &#x201C;role-play&#x201D; characters by following statistical patterns rather than maintaining grounded identities; consequently, persona continuity is inherently brittle in the absence of architectural mechanisms such as memory or state tracking. 
Our findings similarly reflect that current well-being&#x2013;oriented CAs rarely incorporate such mechanisms, relying instead on static instructions that do not support evolving or contextually reinforced identities.</p><p>Furthermore, research in HCI and communication science indicates that personas&#x2014;especially when used to guide social interaction&#x2014;impact user perception, engagement, and trust. Controlled experiments with embodied LLM agents show that manipulating personality traits (eg, introversion vs extraversion) significantly affects social evaluations, emotional experience, and behavioral engagement [<xref ref-type="bibr" rid="ref54">54</xref>]. However, these effects depend on the consistency and credibility of the persona. Complementary evidence reveals that AI-generated personas often appear stereotypical or insufficiently nuanced compared to human-crafted ones, raising concerns about whether LLM personas authentically capture user diversity or complexity [<xref ref-type="bibr" rid="ref55">55</xref>]. Similarly, work on demographic persona prompting demonstrates that LLMs may reflect demographic biases or fail to accurately maintain demographic-specific viewpoints unless tightly constrained [<xref ref-type="bibr" rid="ref48">48</xref>].</p><p>This lack of continuity is critical because it undermines the conditions under which users treat agents as social actors. Nass and Moon [<xref ref-type="bibr" rid="ref56">56</xref>] demonstrated that humans apply social norms to computer agents when those agents maintain a coherent identity and consistent interpersonal behavior&#x2014;qualities that foster deeper engagement and trust. These effects rely on interactional consistency and memory to simulate &#x201C;mindfulness&#x201D; and sustained social presence, both of which are often absent in current LLM-based implementations. 
This challenge was also highlighted by Pataranutaporn et al [<xref ref-type="bibr" rid="ref57">57</xref>], who note that although AI-generated characters can simulate highly personalized roles (eg, mentor and therapist), their relational integrity is fragile unless backed by memory systems and contextual continuity mechanisms.</p><p>Taken together, these insights suggest that prompt-based social role construction is brittle and insufficient for sustained engagement. To address this, future systems should incorporate persistent memory mechanisms, state awareness, or hybrid logic layers to reinforce social cues across interactions. Moreover, there is a clear need for empirical evaluation of persona coherence&#x2014;an area that remains largely untested despite its centrality to trust, compliance, and user satisfaction in relational agents.</p><p>The third major theme emerging from our findings concerns the profound challenge that current design practices pose to scientific rigor. LLM-based CAs reflect a tension between design flexibility and methodological fragmentation. On the one hand, the ability to craft prompts as low-code design primitives allows for rapid customization and iterative prototyping. Developers can create context-sensitive interactions by embedding goals, roles, and emotional cues directly into natural language instructions. However, as recent work has shown, this flexibility often comes at the cost of consistency, transparency, and evaluative rigor. 
Large-scale methodological reviews increasingly describe LLM research ecosystems as fragmented and underspecified, noting that the lack of shared standards for documenting prompts, configurations, and evaluation pipelines produces substantial barriers to comparability and scientific accumulation [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>].</p><p>For instance, Hanauer et al [<xref ref-type="bibr" rid="ref60">60</xref>] found that a large proportion of LLM-driven clinical studies failed to report basic implementation details, such as model version, parameter settings, or the timing of usage&#x2014;critical factors whose omission undermines reproducibility. Similarly, Zamfirescu-Pereira et al [<xref ref-type="bibr" rid="ref61">61</xref>] demonstrate that non-AI experts often struggle to design effective prompts and rarely document prompt rationale, iterations, or failures. These studies highlight that while prompt engineering lowers technical barriers, it introduces new challenges related to reproducibility, replicability, and responsible design practice. Prompt design is rarely subjected to empirical testing, and its impact on system behavior is often undocumented or informally evaluated&#x2014;a concern echoed in psychological research, where Demszky et al [<xref ref-type="bibr" rid="ref62">62</xref>] warn that LLM-based interventions frequently lack theoretical grounding or validation against established constructs.</p><p>This lack of formalization extends directly to evaluation practices. Few studies attempted to link agent interaction with behavioral or cognitive outcomes, and fewer still used validated instruments. Overall, the variability and limited rigor of outcome measures across studies suggest that much of the current work remains at a proof-of-concept level, with an emphasis on feasibility and system development rather than validated behavioral or cognitive outcomes. 
These observations are consistent with findings from Shool et al [<xref ref-type="bibr" rid="ref63">63</xref>], who reviewed more than 700 LLM studies in clinical medicine and found that most relied on ad hoc performance indicators&#x2014;such as accuracy or readability&#x2014;while neglecting more robust, psychometrically validated tools. These observations echo broader concerns in recent LLM evaluation scholarship regarding the limitations of current benchmarks and metrics, which insufficiently capture meaningful, real-world performance or safety [<xref ref-type="bibr" rid="ref58">58</xref>].</p><p>These combined observations on design and evaluation raise important concerns about the current maturity of LLM-based CA research in well-being domains. To strengthen the evidentiary base, future work should adopt mixed methods designs with validated outcome measures and comparative baselines. Furthermore, prompt design artifacts, agent configurations, and transcripts should be published or shared where possible to support reproducibility and transparency in this rapidly evolving design space.</p><p>Finally, beyond the challenges of prompt transparency, a deeper impediment to reproducibility in LLM-based interventions lies in the widespread reliance on proprietary, continuously updated models. These systems&#x2014;such as OpenAI&#x2019;s GPT series and Google&#x2019;s Gemini&#x2014;are fundamentally nontransparent black boxes, a characteristic repeatedly highlighted in the literature as a central obstacle to accountability and scientific verification. 
Existing literature emphasizes that LLMs&#x2019; internal mechanisms remain opaque even to expert users, complicating efforts to understand or trace how outputs are generated (eg, their &#x201C;algorithmic opacity&#x201D;) and limiting the ability to contest or replicate results [<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref65">65</xref>].</p><p>A critical consequence of this opacity is model drift. Indeed, proprietary LLMs are routinely updated without version-locking or archival access, meaning that the same prompt issued weeks or months apart may yield measurably different outputs. Such evolving behavior has been noted as incompatible with basic scientific principles of repeatability and falsifiability, as researchers cannot access, &#x201C;freeze,&#x201D; or independently inspect prior states of the model used in their studies [<xref ref-type="bibr" rid="ref66">66</xref>]. This challenge is distinct from issues of explainability; even perfect prompt documentation cannot compensate for the fact that the underlying computational pathway is inaccessible and mutable.</p><p>The inability to audit or update underlying training data compounds this problem. While transparency frameworks increasingly stress the importance of auditability (ie, the ability to identify what data or processes contributed to an output), current proprietary LLMs rarely enable such inspection, creating structural barriers to verifying results or correcting errors [<xref ref-type="bibr" rid="ref66">66</xref>]. 
Scholars further warn that focusing solely on explainability can obscure the more pressing practical issue: users lack sufficient clarity about how these systems operate and what their limitations are, further undermining reproducibility across contexts [<xref ref-type="bibr" rid="ref67">67</xref>].</p><p>Given these constraints, researchers using proprietary LLMs should explicitly acknowledge the inherent limitations to reproducibility and document, at minimum, the date range of model access, application programming interface version information (if any), and all implementation details that can feasibly be reported. Although such documentation cannot fully compensate for the absence of model version stability, it substantially improves transparency and allows future researchers to contextualize observed outputs within the dynamic evolution of the model.</p><p>In addition to transparency constraints, reproducibility poses an even more fundamental challenge for LLM-based research. Recent evaluations across clinical and information-retrieval domains reveal that proprietary LLMs exhibit intrinsic output instability: even when prompts, inputs, and contexts are held constant, model responses vary in ways that cannot be fully controlled or accounted for by researchers [<xref ref-type="bibr" rid="ref68">68</xref>,<xref ref-type="bibr" rid="ref69">69</xref>]. This instability reflects not simply stochastic sampling, but deeper properties of opaque, continuously optimized systems whose internal states and inference pathways are inaccessible. As a result, reproducibility failures arise even before model drift is considered, compounding the challenge introduced by unannounced backend updates. Together, these characteristics make LLMs fundamentally different from traditional research instruments: they cannot be frozen, independently audited, or deterministically rerun. 
Consequently, LLM-based studies must treat reproducibility not as a procedural hurdle but as a structural limitation of the technology, necessitating explicit acknowledgment and meticulous reporting of model versions, access dates, prompt configurations, and variability observed during experimentation.</p></sec><sec id="s4-2"><title>Limitations</title><p>While this scoping review provides a comprehensive mapping of LLM-based CAs in PA and cognitive training, it is important to acknowledge several limitations that shape the interpretation and generalizability of our findings.</p><p>As a scoping review, our primary aim was to map the breadth of existing literature rather than to conduct a deep synthesis or formal quality appraisal of individual studies. Consequently, we did not formally assess the methodological quality or risk of bias of the included studies, meaning our review cannot make definitive statements about the robustness of the evidence or the causal effectiveness of the interventions. Similarly, we did not perform a meta-analysis or other quantitative aggregation of outcomes due to the inherent heterogeneity of study designs, interventions, and outcome measures, with our reported outcomes primarily consisting of qualitative summaries. While providing a comprehensive overview, the broad nature of a scoping review also implies that specific nuances within individual studies or particular application contexts might not have been explored in exhaustive detail.</p><p>Our search strategy, while comprehensive within its defined parameters, was subject to certain constraints. Searches were limited to WoS and Elsevier Scopus; thus, relevant studies published in other databases, gray literature, or conference proceedings not indexed in these sources might have been missed. 
The effectiveness of our search was also dependent on the chosen keyword combinations (K1, K2, K3, and K4); although these were developed through preliminary scans and expert consultation, it is possible that alternative terminology or emerging concepts related to LLM-based CAs, PA, or cognitive training were not fully captured. Furthermore, our review was restricted to articles published in English, potentially excluding pertinent research published in other languages. While justified by the emergence of LLMs, the time frame restriction to publications between January 2018 and December 2024 means that earlier foundational work or very recent developments (post-2024) were not included. Finally, despite a comprehensive search, the small sample size of only 10 studies meeting our stringent eligibility criteria reflects the nascent stage of research in these specific domains, inherently limiting the generalizability and robustness of our findings.</p><p>Our analysis was based solely on the information reported in the full-text articles, which means that insufficient reporting in original studies could lead to incomplete data extraction or synthesis. Additionally, while our framework analysis of qualitative data, such as prompt strategies and reported outcomes, was systematic, it inherently involved a degree of subjective interpretation by the reviewers. Although consensus was reached through discussion to mitigate this, individual biases cannot be entirely eliminated.</p><p>This review specifically identified challenges inherent to LLM research that also serve as limitations to its own reproducibility. A significant proportion of the identified LLM-based CAs used proprietary models (eg, specific versions of GPT and Bard), which operate as &#x201C;black boxes&#x201D; with undisclosed architectures, training data, and update cycles. 
This means that replicating the exact behavior or outputs of these agents is inherently challenging, if not impossible, as the underlying model can change over time (&#x201C;model drift&#x201D;) even with the same prompt, representing a fundamental lack of version control and transparency within proprietary LLMs. Furthermore, while prompt engineering emerged as a critical design practice, our review found that prompts were often inconsistently documented or treated informally within the studies, impeding the ability of other researchers to precisely reproduce the designed agent behaviors or verify findings.</p><p>In conclusion, while this scoping review provides valuable insights into the emerging field of LLM-based CAs for well-being, these limitations should be considered when interpreting our findings and inform future research endeavors aiming for greater methodological rigor and transparency in this rapidly evolving domain.</p></sec><sec id="s4-3"><title>Future Works and Research Directions</title><p>This scoping review has systematically mapped the current landscape of LLM-based CAs for PA and cognitive training, identifying several critical gaps and promising avenues for future investigation. Building upon our findings, we propose the following key directions for future research.</p><sec id="s4-3-1"><title>Advancing LLM-Based CA Applications and Design</title><p>Given the nascent and exploratory nature of LLM-based CAs in cognitive training, future work should prioritize the development and rigorous evaluation of interventions grounded in validated cognitive protocols. This includes exploring their utility for specific cognitive functions beyond reminiscence and task scaffolding, such as executive functions or attention, potentially in clinical populations. Our review also highlighted a predominant focus on dyadic, one-to-one interactions, with &#x201C;coaching&#x201D; being a commonly adopted role. 
Future research should thus investigate more diverse and complex social dynamics, explicitly exploring social setups such as companionship, alongside multiuser interfaces, group-based interventions, or integrating LLM-based CAs into broader community support systems for collective well-being. Furthermore, the observed fragility of LLM-based CA social roles underscores the need for designing systems with persistent memory mechanisms, state awareness, and adaptive logical layers. Research should explore how these structural reinforcements can enable more consistent, trustworthy, and long-term therapeutic alliances or human-agent relationships.</p></sec><sec id="s4-3-2"><title>Strengthening Methodological Rigor and Transparency</title><p>There is a critical need for more robust quantitative evaluation of LLM-based CA effectiveness. Future studies should move beyond perceived usefulness and user satisfaction, using validated behavioral and cognitive outcome measures. This necessitates the adoption of mixed methods designs, ideally with control groups and longitudinal follow-up, to ascertain sustained impact. Our findings also underscore that prompt engineering is a critical, yet inconsistently documented, design practice. Future research should prioritize developing and adopting standardized methodologies for prompt design, iteration, and evaluation. Furthermore, the systematic and open sharing of detailed prompt structures, including parameters such as temperature or top-p, is essential for enabling the reproducibility of LLM-based CA behaviors, as suggested by the LLM guidelines project [<xref ref-type="bibr" rid="ref70">70</xref>] initiated by Wagner et al [<xref ref-type="bibr" rid="ref71">71</xref>]. Addressing reproducibility with proprietary LLMs is another fundamental challenge, as their inherent variability and black box nature (eg, model drift over time) present a significant impediment to replication. 
Future research must explicitly acknowledge these limitations, and when proprietary models are used, authors should meticulously document the exact model version, application programming interface details, and precise dates of interaction. The community should also explore and contribute to research using open-source LLMs, where version control and long-term reproducibility can be more readily ensured. Finally, while some studies share data, practices remain inconsistent. Future work should fully embrace comprehensive data sharing, adhering to FAIR principles. Specifically, for LLM-based interventions, this mandates a focus on more detailed and structured prompt documentation, enabling their precise reproduction for verification and future research.</p></sec></sec><sec id="s4-4"><title>Conclusions</title><p>This scoping review meticulously charted the nascent field of LLM-based CAs in PA and cognitive training. Our synthesis revealed a dynamic yet underexplored landscape, marked by a strong emphasis on PA coaching and a reliance on dyadic, prompt-driven interactions. While these agents demonstrate initial promise in engagement and perceived usefulness, a critical evaluation points to significant gaps in methodological rigor and comprehensive outcome assessment. Crucially, the unique challenges posed by proprietary LLMs and the current lack of structured prompt sharing emerge as fundamental impediments to reproducibility in this rapidly evolving domain. Addressing these issues through rigorous evaluation and a strong commitment to open science will be paramount to advancing the scientific understanding and responsible deployment of LLM-based CAs for well-being.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>Funding for this research was provided by two sources. 
The European Commission, through its Horizon research and innovation program, DORIAN GRAY (grant 101156266), provided funding for some of the co-authors&#x2019; salaries and the Covidence platform license. The R&#x00E9;seau de Comp&#x00E9;tence HES-SO Economie &#x0026; Management provided funding that covered the salary of one of the co-authors (project name SEP4PA).</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CA</term><def><p>conversational agent</p></def></def-item><def-item><term id="abb3">FAIR</term><def><p>findable, accessible, interoperable, reusable</p></def></def-item><def-item><term id="abb4">HCI</term><def><p>human-computer interaction</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">PA</term><def><p>physical activity</p></def></def-item><def-item><term id="abb7">PRISMA-ScR</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews</p></def></def-item><def-item><term id="abb8">RQ</term><def><p>research question</p></def></def-item><def-item><term id="abb9">WoS</term><def><p>Clarivate Web of Science</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Kraut</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Mohr</surname><given-names>DC</given-names> 
</name></person-group><article-title>Systematic review and meta-analysis of AI-based conversational agents for promoting mental health and well-being</article-title><source>NPJ Digit Med</source><year>2023</year><month>12</month><day>19</day><volume>6</volume><issue>1</issue><fpage>236</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00979-5</pub-id><pub-id pub-id-type="medline">38114588</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maher</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Curtis</surname><given-names>RG</given-names> </name><name name-style="western"><surname>Short</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>KJ</given-names> </name></person-group><article-title>A physical activity and diet program delivered by artificially intelligent virtual health coach: proof-of-concept study</article-title><source>JMIR mHealth uHealth</source><year>2020</year><month>07</month><day>10</day><volume>8</volume><issue>7</issue><fpage>e17558</fpage><pub-id pub-id-type="doi">10.2196/17558</pub-id><pub-id pub-id-type="medline">32673246</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kocielnik</surname><given-names>R</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>L</given-names> </name><name name-style="western"><surname>Avrahami</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>G</given-names> </name></person-group><article-title>Reflection companion: a conversational system for engaging users in reflection on physical 
activity</article-title><source>Proceedings of the ACM Interactive Mobile Wearable and Ubiquitous Technologies</source><year>2018</year><volume>2</volume><issue>2</issue><fpage>70</fpage><pub-id pub-id-type="doi">10.1145/3214273</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Oh</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Lange</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Fukuoka</surname><given-names>Y</given-names> </name></person-group><article-title>Artificial intelligence chatbot behavior change model for designing artificial intelligence chatbots to promote physical activity and a healthy diet: viewpoint</article-title><source>J Med Internet Res</source><year>2020</year><month>09</month><day>30</day><volume>22</volume><issue>9</issue><fpage>e22845</fpage><pub-id pub-id-type="doi">10.2196/22845</pub-id><pub-id pub-id-type="medline">32996892</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cowan</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Pantidi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Coyle</surname><given-names>D</given-names> </name><etal/></person-group><article-title>&#x201C;What can I help you with?&#x201D;: infrequent users&#x2019; experiences of intelligent personal assistants</article-title><conf-name>Proceedings of the 19th International Conference on Human-Computer Interaction with Mobile Devices and Services</conf-name><conf-date>Sep 4-7, 2017</conf-date><conf-loc>Vienna, 
Austria</conf-loc><fpage>1</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1145/3098279.3098539</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sch&#x00F6;bel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schmitt</surname><given-names>A</given-names> </name><name name-style="western"><surname>Benner</surname><given-names>D</given-names> </name><name name-style="western"><surname>Saqr</surname><given-names>M</given-names> </name><name name-style="western"><surname>Janson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Leimeister</surname><given-names>JM</given-names> </name></person-group><article-title>Charting the evolution and future of conversational agents: a research agenda along five waves and new frontiers</article-title><source>Inf Syst Front</source><year>2024</year><month>04</month><volume>26</volume><issue>2</issue><fpage>729</fpage><lpage>754</lpage><pub-id pub-id-type="doi">10.1007/s10796-023-10375-9</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>WX</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>K</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey of large language models</article-title><source>arXiv</source><comment>Preprint posted online on Mar 31, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.18223</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Chang</surname><given-names>KCC</given-names> </name></person-group><article-title>Towards reasoning in large language models: a survey</article-title><conf-name>Findings of the Association for Computational Linguistics: ACL 2023</conf-name><conf-date>Jul 9-14, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.findings-acl.67</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>When large language models meet personalization: perspectives of challenges and opportunities</article-title><source>World Wide Web</source><year>2024</year><month>07</month><volume>27</volume><issue>4</issue><fpage>42</fpage><pub-id pub-id-type="doi">10.1007/s11280-024-01276-1</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bouneffouf</surname><given-names>D</given-names> </name><name name-style="western"><surname>Landa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jespersen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Corcoran</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cecchi</surname><given-names>G</given-names> </name></person-group><article-title>COMPASS: computational mapping of patient-therapist alliance strategies with language modeling</article-title><source>Transl Psychiatry</source><year>2025</year><volume>15</volume><fpage>166</fpage><pub-id 
pub-id-type="doi">10.1038/s41398-025-03379-3</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stephanidis</surname><given-names>C</given-names> </name><name name-style="western"><surname>Salvendy</surname><given-names>G</given-names> </name><name name-style="western"><surname>Antona</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Seven HCI grand challenges</article-title><source>Int J Hum Comput Interact</source><year>2019</year><month>08</month><day>27</day><volume>35</volume><issue>14</issue><fpage>1229</fpage><lpage>1269</lpage><pub-id pub-id-type="doi">10.1080/10447318.2019.1619259</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aldenaini</surname><given-names>N</given-names> </name><name name-style="western"><surname>Alslaity</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sampalli</surname><given-names>S</given-names> </name><name name-style="western"><surname>Orji</surname><given-names>R</given-names> </name></person-group><article-title>Persuasive strategies and their implementations in mobile interventions for physical activity: a systematic review</article-title><source>Int J Hum Comput Interact</source><year>2023</year><month>07</month><day>21</day><volume>39</volume><issue>12</issue><fpage>2292</fpage><lpage>2338</lpage><pub-id pub-id-type="doi">10.1080/10447318.2022.2075573</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Matthews</surname><given-names>J</given-names> </name><name name-style="western"><surname>Win</surname><given-names>KT</given-names> </name><name 
name-style="western"><surname>Oinas-Kukkonen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Freeman</surname><given-names>M</given-names> </name></person-group><article-title>Persuasive technology in mobile applications promoting physical activity: a systematic review</article-title><source>J Med Syst</source><year>2016</year><month>03</month><volume>40</volume><issue>3</issue><fpage>72</fpage><pub-id pub-id-type="doi">10.1007/s10916-015-0425-x</pub-id><pub-id pub-id-type="medline">26748792</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chew</surname><given-names>HSJ</given-names> </name></person-group><article-title>The use of artificial intelligence&#x2013;based conversational agents (chatbots) for weight loss: scoping review and practical recommendations</article-title><source>JMIR Med Inform</source><year>2022</year><month>04</month><day>13</day><volume>10</volume><issue>4</issue><fpage>e32578</fpage><pub-id pub-id-type="doi">10.2196/32578</pub-id><pub-id pub-id-type="medline">35416791</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lyzwinski</surname><given-names>LN</given-names> </name><name name-style="western"><surname>Elgendi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Menon</surname><given-names>C</given-names> </name></person-group><article-title>Conversational agents and avatars for cardiometabolic risk factors and lifestyle&#x2011;related behaviors: scoping review</article-title><source>JMIR mHealth uHealth</source><year>2023</year><month>05</month><day>25</day><volume>11</volume><fpage>e39649</fpage><pub-id pub-id-type="doi">10.2196/39649</pub-id><pub-id pub-id-type="medline">37227765</pub-id></nlm-citation></ref><ref 
id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Won</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hahm</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name></person-group><article-title>Conversational agents for body weight management: systematic review</article-title><source>J Med Internet Res</source><year>2023</year><month>05</month><day>26</day><volume>25</volume><fpage>e42238</fpage><pub-id pub-id-type="doi">10.2196/42238</pub-id><pub-id pub-id-type="medline">37234029</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Fukuoka</surname><given-names>Y</given-names> </name></person-group><article-title>A systematic review of artificial intelligence chatbots for promoting physical activity, healthy diet, and weight loss</article-title><source>Int J Behav Nutr Phys Act</source><year>2021</year><month>12</month><day>11</day><volume>18</volume><issue>1</issue><fpage>160</fpage><pub-id pub-id-type="doi">10.1186/s12966-021-01224-6</pub-id><pub-id pub-id-type="medline">34895247</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dingler</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Kwasnicka</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gong</surname><given-names>E</given-names> </name><name name-style="western"><surname>Oldenburg</surname><given-names>B</given-names> </name></person-group><article-title>The use and promise of conversational agents in digital health</article-title><source>Yearb Med Inform</source><year>2021</year><month>08</month><volume>30</volume><issue>1</issue><fpage>191</fpage><lpage>199</lpage><pub-id pub-id-type="doi">10.1055/s-0041-1726510</pub-id><pub-id pub-id-type="medline">34479391</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Aguilera</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lyles</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Figueroa</surname><given-names>CA</given-names> </name></person-group><article-title>Promoting physical activity through conversational agents: mixed methods systematic review</article-title><source>J Med Internet Res</source><year>2021</year><month>09</month><day>14</day><volume>23</volume><issue>9</issue><fpage>e25486</fpage><pub-id pub-id-type="doi">10.2196/25486</pub-id><pub-id pub-id-type="medline">34519653</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Fruitet</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fouillen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Facque</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Chainay</surname><given-names>H</given-names> </name><name name-style="western"><surname>De Chalvron</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tarpin-Bernard</surname><given-names>F</given-names> </name></person-group><article-title>Engaging with an embodied conversational agent in a computerized cognitive training: an acceptability study with the elderly</article-title><conf-name>Proceedings of the ACM International Conference on Multimodal Interaction Companion (ICMI &#x2019;23 Companion)</conf-name><conf-date>Oct 9-13, 2023</conf-date><conf-loc>Paris, France</conf-loc><fpage>359</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.1145/3610661.3616130</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chan</surname><given-names>SWT</given-names> </name><name name-style="western"><surname>Sapkota</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mathews</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Nanayakkara</surname><given-names>S</given-names> </name></person-group><article-title>Prompto: investigating receptivity to prompts based on cognitive load from memory training conversational agent</article-title><source>Proc ACM Interact Mob Wearable Ubiquitous Technol</source><year>2020</year><volume>4</volume><issue>4</issue><fpage>121</fpage><pub-id pub-id-type="doi">10.1145/3432190</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Kim</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>GH</given-names> </name></person-group><article-title>Exploring the role of engagement and adherence in chatbot-based cognitive training for older adults: memory function and mental health outcomes</article-title><source>Behav Inf Technol</source><year>2025</year><month>06</month><day>15</day><volume>44</volume><issue>10</issue><fpage>2405</fpage><lpage>2417</lpage><pub-id pub-id-type="doi">10.1080/0144929X.2024.2362406</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lauenroth</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ioannidis</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Teichmann</surname><given-names>B</given-names> </name></person-group><article-title>Influence of combined physical and cognitive training on cognition: a systematic review</article-title><source>BMC Geriatr</source><year>2016</year><month>07</month><day>18</day><volume>16</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1186/s12877-016-0315-1</pub-id><pub-id pub-id-type="medline">27431673</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bherer</surname><given-names>L</given-names> </name><name name-style="western"><surname>Erickson</surname><given-names>KI</given-names> </name><name name-style="western"><surname>Liu-Ambrose</surname><given-names>T</given-names> </name></person-group><article-title>A review of the effects of physical activity and exercise on cognitive and brain functions in older adults</article-title><source>J 
Aging Res</source><year>2013</year><volume>2013</volume><fpage>657508</fpage><pub-id pub-id-type="doi">10.1155/2013/657508</pub-id><pub-id pub-id-type="medline">24102028</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carrubba</surname><given-names>C</given-names> </name><name name-style="western"><surname>Torre</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Langeard</surname><given-names>A</given-names> </name><name name-style="western"><surname>Temprado</surname><given-names>JJ</given-names> </name></person-group><article-title>Enhancing cognition in older adults with interactive wall exergames</article-title><source>Sci Rep</source><year>2025</year><month>10</month><day>23</day><volume>15</volume><issue>1</issue><fpage>37104</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-21060-z</pub-id><pub-id pub-id-type="medline">41131194</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tricco</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Lillie</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zarin</surname><given-names>W</given-names> </name><etal/></person-group><article-title>PRISMA extension for scoping reviews (PRISMA&#x2011;ScR): checklist and explanation</article-title><source>Ann Intern Med</source><year>2018</year><month>10</month><day>2</day><volume>169</volume><issue>7</issue><fpage>467</fpage><lpage>473</lpage><pub-id pub-id-type="doi">10.7326/M18-0850</pub-id><pub-id pub-id-type="medline">30178033</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Silacci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Giachetti</surname><given-names>B</given-names> </name><name name-style="western"><surname>Angelini</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Large language model-based agents for physical activity and cognitive training: a scoping review</article-title><source>JMIR Preprints</source><access-date>2026-02-20</access-date><comment>Preprint posted online on Jul 4, 2025</comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>VK</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>P</given-names> </name><name name-style="western"><surname>Karmakar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Leta</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mayr</surname><given-names>P</given-names> </name></person-group><article-title>The journal coverage of Web of Science, Scopus and Dimensions: a comparative analysis</article-title><source>Scientometrics</source><year>2021</year><month>06</month><volume>126</volume><issue>6</issue><fpage>5113</fpage><lpage>5142</lpage><pub-id pub-id-type="doi">10.1007/s11192-021-03948-5</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gusenbauer</surname><given-names>M</given-names> </name></person-group><article-title>Search where you will find most: comparing the disciplinary coverage of 56 bibliographic databases</article-title><source>Scientometrics</source><year>2022</year><volume>127</volume><issue>5</issue><fpage>2683</fpage><lpage>2745</lpage><pub-id pub-id-type="doi">10.1007/s11192-022-04289-7</pub-id><pub-id 
pub-id-type="medline">35571007</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><comment>Preprint posted online on Oct 11, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guerreiro</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Angelini</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rafael Henriques</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Conversational agents for health and well&#x2011;being across the life course: protocol for an evidence map</article-title><source>JMIR Res Protoc</source><year>2021</year><month>09</month><day>17</day><volume>10</volume><issue>9</issue><fpage>e26680</fpage><pub-id pub-id-type="doi">10.2196/26680</pub-id><pub-id pub-id-type="medline">34533460</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><source>Covidence</source><access-date>2026-02-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://covidence.org">https://covidence.org</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Gale</surname><given-names>NK</given-names> </name><name name-style="western"><surname>Heath</surname><given-names>G</given-names> </name><name name-style="western"><surname>Cameron</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rashid</surname><given-names>S</given-names> </name><name name-style="western"><surname>Redwood</surname><given-names>S</given-names> </name></person-group><article-title>Using the framework method for the analysis of qualitative data in multi-disciplinary health research</article-title><source>BMC Med Res Methodol</source><year>2013</year><month>09</month><day>18</day><volume>13</volume><issue>1</issue><fpage>117</fpage><pub-id pub-id-type="doi">10.1186/1471-2288-13-117</pub-id><pub-id pub-id-type="medline">24047204</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Salehzadeh Niksirat</surname><given-names>K</given-names> </name><name name-style="western"><surname>Goswami</surname><given-names>L</given-names> </name><name name-style="western"><surname>S. B. 
Rao</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Changes in research ethics, openness, and transparency in empirical studies between CHI 2017 and CHI 2022</article-title><conf-name>Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems</conf-name><conf-date>Apr 23-28, 2023</conf-date><conf-loc>Hamburg, Germany</conf-loc><fpage>1</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1145/3544548.3580848</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bak</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chin</surname><given-names>J</given-names> </name></person-group><article-title>The potential and limitations of large language models in identification of the states of motivations for facilitating health behavior change</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2047</fpage><lpage>2053</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae057</pub-id><pub-id pub-id-type="medline">38527272</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Washif</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Pagaduan</surname><given-names>J</given-names> </name><name name-style="western"><surname>James</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dergaa</surname><given-names>I</given-names> </name><name name-style="western"><surname>Beaven</surname><given-names>CM</given-names> </name></person-group><article-title>Artificial intelligence in sport: exploring the potential of using ChatGPT in resistance training prescription</article-title><source>Biol 
Sport</source><year>2024</year><month>03</month><volume>41</volume><issue>2</issue><fpage>209</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.5114/biolsport.2024.132987</pub-id><pub-id pub-id-type="medline">38524820</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dergaa</surname><given-names>I</given-names> </name><name name-style="western"><surname>Saad</surname><given-names>HB</given-names> </name><name name-style="western"><surname>El Omri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Using artificial intelligence for exercise prescription in personalised health promotion: a critical evaluation of OpenAI&#x2019;s GPT-4 model</article-title><source>Biol Sport</source><year>2024</year><month>03</month><volume>41</volume><issue>2</issue><fpage>221</fpage><lpage>241</lpage><pub-id pub-id-type="doi">10.5114/biolsport.2024.133661</pub-id><pub-id pub-id-type="medline">38524814</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pugliese</surname><given-names>N</given-names> </name><name name-style="western"><surname>Polverini</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lombardi</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation of ChatGPT as a counselling tool for Italian&#x2011;speaking MASLD patients: assessment of accuracy, completeness and comprehensibility</article-title><source>J Pers Med</source><year>2024</year><month>05</month><day>26</day><volume>14</volume><issue>6</issue><fpage>568</fpage><pub-id pub-id-type="doi">10.3390/jpm14060568</pub-id><pub-id pub-id-type="medline">38929789</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zaleski</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Berkowsky</surname><given-names>R</given-names> </name><name name-style="western"><surname>Craig</surname><given-names>KJT</given-names> </name><name name-style="western"><surname>Pescatello</surname><given-names>LS</given-names> </name></person-group><article-title>Comprehensiveness, accuracy, and readability of exercise recommendations provided by an AI&#x2011;based chatbot: mixed methods study</article-title><source>JMIR Med Educ</source><year>2024</year><month>01</month><day>11</day><volume>10</volume><fpage>e51308</fpage><pub-id pub-id-type="doi">10.2196/51308</pub-id><pub-id pub-id-type="medline">38206661</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vandelanotte</surname><given-names>C</given-names> </name><name name-style="western"><surname>Trost</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hodgetts</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Increasing physical activity using an just-in-time adaptive digital assistant supported by machine learning: a novel approach for hyper-personalised mHealth interventions</article-title><source>J Biomed Inform</source><year>2023</year><month>08</month><volume>144</volume><fpage>104435</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2023.104435</pub-id><pub-id pub-id-type="medline">37394024</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>X</given-names> </name><name name-style="western"><surname>Teljeur</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bosch</surname><given-names>JA</given-names> </name></person-group><article-title>Can a funny chatbot make a difference? Infusing humor into conversational agent for behavioral intervention</article-title><year>2024</year><month>07</month><day>8</day><conf-name>CUI &#x2019;24</conf-name><conf-date>Jul 8-10, 2024</conf-date><conf-loc>Luxembourg, Luxembourg</conf-loc><fpage>1</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1145/3640794.3665555</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Designing scaffolding strategies for conversational agents in dialog task of neurocognitive disorders screening</article-title><conf-name>Proceedings of the 2024 CHI Conference on Human Factors in Computing Systems</conf-name><conf-date>May 11-16, 2024</conf-date><conf-loc>Honolulu, HI</conf-loc><fpage>1</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1145/3613904.3642960</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Norel</surname><given-names>R</given-names> </name></person-group><article-title>Large language 
models as a tool for cognitive stimulation: chatbot book clubs for seniors</article-title><conf-name>2024 IEEE International Conference on Digital Health (ICDH)</conf-name><conf-date>Jul 7-13, 2024</conf-date><conf-loc>Shenzhen, China</conf-loc><fpage>123</fpage><lpage>125</lpage><pub-id pub-id-type="doi">10.1109/ICDH62654.2024.00029</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Favela</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cruz-Sandoval</surname><given-names>D</given-names> </name><name name-style="western"><surname>Parra</surname><given-names>MO</given-names> </name></person-group><article-title>Conversational agents for dementia using large language models</article-title><conf-name>2023 Mexican International Conference on Computer Science (ENC)</conf-name><conf-date>Sep 11-13, 2023</conf-date><conf-loc>Guanajuato, Guanajuato, Mexico</conf-loc><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1109/ENC60556.2023.10508610</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="web"><source>ChatGPT</source><access-date>2026-02-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://chatgpt.com">https://chatgpt.com</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cruz-Sandoval</surname><given-names>D</given-names> </name><name name-style="western"><surname>Favela</surname><given-names>J</given-names> </name></person-group><article-title>Incorporating conversational strategies in a social robot to interact with people with dementia</article-title><source>Dement Geriatr Cogn Disord</source><year>2019</year><volume>47</volume><issue>3</issue><fpage>140</fpage><lpage>148</lpage><pub-id 
pub-id-type="doi">10.1159/000497801</pub-id><pub-id pub-id-type="medline">31247627</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weimann</surname><given-names>TG</given-names> </name><name name-style="western"><surname>Schlieter</surname><given-names>H</given-names> </name><name name-style="western"><surname>Brendel</surname><given-names>AB</given-names> </name></person-group><article-title>Virtual coaches: background, theories, and future research directions</article-title><source>Bus Inf Syst Eng</source><year>2022</year><volume>64</volume><issue>4</issue><fpage>515</fpage><lpage>528</lpage><pub-id pub-id-type="doi">10.1007/s12599-022-00757-9</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zhan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Such</surname><given-names>J</given-names> </name></person-group><article-title>Building better AI agents: a provocation on the utilisation of persona in LLM-based conversational agents</article-title><conf-name>Proceedings of the 6th Conference on ACM Conversational User Interfaces, CUI 2024</conf-name><conf-date>Jul 8-10, 2024</conf-date><conf-loc>Luxembourg, Luxembourg</conf-loc><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1145/3640794.3665887</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bhandari</surname><given-names>P</given-names> </name><name name-style="western"><surname>Fay</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wise</surname><given-names>MJ</given-names> 
</name><etal/></person-group><article-title>Can LLM agents maintain a persona in discourse?</article-title><conf-name>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 4-9, 2025</conf-date><conf-loc>Suzhou, China</conf-loc><fpage>29201</fpage><lpage>29217</lpage><pub-id pub-id-type="doi">10.18653/v1/2025.emnlp-main.1487</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hsu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name></person-group><article-title>From personas to talks: revisiting the impact of personas on LLM-synthesized emotional support conversations</article-title><conf-name>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 4-9, 2025</conf-date><conf-loc>Suzhou, China</conf-loc><fpage>5439</fpage><lpage>5453</lpage><pub-id pub-id-type="doi">10.18653/v1/2025.emnlp-main.277</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>N</given-names> </name><name name-style="western"><surname>Frieske</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Survey of hallucination in natural language generation</article-title><source>ACM Comput 
Surv</source><year>2023</year><month>12</month><day>31</day><volume>55</volume><issue>12</issue><fpage>1</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1145/3571730</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Logeswaran</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jurgens</surname><given-names>D</given-names> </name></person-group><article-title>When "a helpful assistant" is not really helpful: personas in system prompts do not improve performances of large language models</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2024</conf-name><conf-date>Nov 12-16, 2024</conf-date><conf-loc>Miami, FL</conf-loc><fpage>15126</fpage><lpage>15154</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.findings-emnlp.888</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shanahan</surname><given-names>M</given-names> </name><name name-style="western"><surname>McDonell</surname><given-names>K</given-names> </name><name name-style="western"><surname>Reynolds</surname><given-names>L</given-names> </name></person-group><article-title>Role play with large language models</article-title><source>Nature</source><year>2023</year><month>11</month><volume>623</volume><issue>7987</issue><fpage>493</fpage><lpage>498</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06647-8</pub-id><pub-id pub-id-type="medline">37938776</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kroczek</surname><given-names>LOH</given-names> </name><name name-style="western"><surname>May</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hettenkofer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ruider</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ludwig</surname><given-names>B</given-names> </name><name name-style="western"><surname>M&#x00FC;hlberger</surname><given-names>A</given-names> </name></person-group><article-title>The influence of persona and conversational task on social interactions with a LLM-controlled embodied conversational agent</article-title><source>Comput Human Behav</source><year>2025</year><month>11</month><volume>172</volume><fpage>108759</fpage><pub-id pub-id-type="doi">10.1016/j.chb.2025.108759</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lazik</surname><given-names>CK</given-names> </name><name name-style="western"><surname>Katins</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kauter</surname><given-names>C</given-names> </name><etal/></person-group><article-title>The impostor is among us: can large language models capture the complexity of human personas?</article-title><conf-name>Proceedings of Mensch und Computer 2025 (MuC &#x2019;25)</conf-name><conf-date>Aug 31 to Sep 3, 2025</conf-date><conf-loc>Chemnitz, Germany</conf-loc><fpage>434</fpage><lpage>451</lpage><pub-id pub-id-type="doi">10.1145/3743049.3743057</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nass</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Moon</surname><given-names>Y</given-names> </name></person-group><article-title>Machines and mindlessness: social responses to computers</article-title><source>J Soc Issues</source><year>2000</year><month>01</month><volume>56</volume><issue>1</issue><fpage>81</fpage><lpage>103</lpage><pub-id pub-id-type="doi">10.1111/0022-4537.00153</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pataranutaporn</surname><given-names>P</given-names> </name><name name-style="western"><surname>Danry</surname><given-names>V</given-names> </name><name name-style="western"><surname>Leong</surname><given-names>J</given-names> </name><etal/></person-group><article-title>AI-generated characters for supporting personalized learning and well-being</article-title><source>Nat Mach Intell</source><year>2021</year><volume>3</volume><issue>12</issue><fpage>1013</fpage><lpage>1022</lpage><pub-id pub-id-type="doi">10.1038/s42256-021-00417-9</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McIntosh</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Susnjak</surname><given-names>T</given-names> </name><name name-style="western"><surname>Arachchilage</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Inadequacies of large language model benchmarks in the era of generative artificial intelligence</article-title><source>IEEE Trans Artif Intell</source><year>2025</year><volume>7</volume><issue>1</issue><fpage>22</fpage><lpage>39</lpage><pub-id pub-id-type="doi">10.1109/TAI.2025.3569516</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey on evaluation of large language models</article-title><source>ACM Trans Intell Syst Technol</source><year>2024</year><month>06</month><day>30</day><volume>15</volume><issue>3</issue><fpage>1</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1145/3641289</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hanauer</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>K</given-names> </name></person-group><article-title>Exploring reproducibility issues related to the use of large language models in the clinical domain [Poster]</article-title><access-date>2026-02-17</access-date><conf-name>AMIA 2025 Informatics Summit</conf-name><conf-date>Mar 10-13, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.davidhanauer.com/CV/abstracts-presentations/2025/LLM_reproducibility_AMIA_abstract.pdf">https://www.davidhanauer.com/CV/abstracts-presentations/2025/LLM_reproducibility_AMIA_abstract.pdf</ext-link></comment></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zamfirescu-Pereira</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>RY</given-names> </name><name name-style="western"><surname>Hartmann</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Yang</surname><given-names>Q</given-names> </name></person-group><article-title>Why Johnny can&#x2019;t prompt: how non-AI experts try (and fail) to design LLM prompts</article-title><conf-name>Proceedings of the 2023 CHI Conference on Human Factors in Computing Systems (CHI &#x2019;23)</conf-name><conf-date>Apr 23-28, 2023</conf-date><conf-loc>Hamburg, Germany</conf-loc><fpage>1</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1145/3544548.3581388</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Demszky</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yeager</surname><given-names>DS</given-names> </name><etal/></person-group><article-title>Using large language models in psychology</article-title><source>Nat Rev Psychol</source><year>2023</year><volume>2</volume><issue>11</issue><fpage>688</fpage><lpage>701</lpage><pub-id pub-id-type="doi">10.1038/s44159-023-00241-5</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shool</surname><given-names>S</given-names> </name><name name-style="western"><surname>Adimi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saboori Amleshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bitaraf</surname><given-names>E</given-names> </name><name name-style="western"><surname>Golpira</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tara</surname><given-names>M</given-names> </name></person-group><article-title>A systematic review of large language model (LLM) evaluations in clinical medicine</article-title><source>BMC Med Inform Decis 
Mak</source><year>2025</year><month>03</month><day>7</day><volume>25</volume><issue>1</issue><fpage>117</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-02954-4</pub-id><pub-id pub-id-type="medline">40055694</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afroogh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Phillips</surname><given-names>C</given-names> </name></person-group><article-title>Navigating LLM ethics: advancements, challenges, and future directions</article-title><source>AI Ethics</source><year>2025</year><month>12</month><volume>5</volume><issue>6</issue><fpage>5795</fpage><lpage>5819</lpage><pub-id pub-id-type="doi">10.1007/s43681-025-00814-5</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name></person-group><article-title>Evaluating the transparency and explainability of LLM-based educational systems</article-title><source>Social Science Research Network (SSRN)</source><access-date>2026-02-05</access-date><comment>Preprint posted online on Mar 31, 2025</comment><comment><ext-link ext-link-type="uri" xlink:href="https://ssrn.com/abstract=5198565">https://ssrn.com/abstract=5198565</ext-link></comment></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shanmugarasa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Ding</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rakotoarivelo</surname><given-names>T</given-names> </name></person-group><article-title>Privacy meets explainability: managing confidential data and transparency policies in LLM-empowered science</article-title><conf-name>Proceedings of the Extended Abstracts of the CHI Conference on Human Factors in Computing Systems (CHI EA &#x2019;25)</conf-name><conf-date>Apr 26 to May 1, 2025</conf-date><conf-loc>Yokohama, Japan</conf-loc><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1145/3706599.3720099</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barman</surname><given-names>KG</given-names> </name><name name-style="western"><surname>Wood</surname><given-names>N</given-names> </name><name name-style="western"><surname>Pawlowski</surname><given-names>P</given-names> </name></person-group><article-title>Beyond transparency and explainability: on the need for adequate and contextualized user guidelines for LLM use</article-title><source>Ethics Inf Technol</source><year>2024</year><month>09</month><volume>26</volume><issue>3</issue><fpage>47</fpage><pub-id pub-id-type="doi">10.1007/s10676-024-09778-2</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Franc</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hart</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hata</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Hertelendy</surname><given-names>A</given-names> </name></person-group><article-title>Repeatability, reproducibility, and diagnostic accuracy of a commercial large language model (ChatGPT) to perform emergency department triage using the Canadian Triage and Acuity Scale</article-title><source>CJEM</source><year>2024</year><month>01</month><volume>26</volume><issue>1</issue><fpage>40</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1007/s43678-023-00616-w</pub-id><pub-id pub-id-type="medline">38206515</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Staudinger</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kusa</surname><given-names>W</given-names> </name><name name-style="western"><surname>Piroi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lipani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hanbury</surname><given-names>A</given-names> </name></person-group><article-title>A reproducibility and generalizability study of large language models for query generation</article-title><conf-name>Proceedings of the Annual International ACM SIGIR Conference on Research and Development in Information Retrieval in the Asia Pacific Region (SIGIR&#x2011;AP &#x2019;24)</conf-name><conf-date>Dec 9-12, 2024</conf-date><conf-loc>Tokyo, Japan</conf-loc><fpage>186</fpage><lpage>196</lpage><pub-id pub-id-type="doi">10.1145/3673791.3698432</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Baltes</surname><given-names>S</given-names> </name><name name-style="western"><surname>Angermeir</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Arora</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Guidelines for empirical studies in software engineering involving large language models</article-title><source>arXiv</source><comment>Preprint posted online on Aug 21, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2508.15503</pub-id></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wagner</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bar&#x00F3;n</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Falessi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Baltes</surname><given-names>S</given-names> </name></person-group><article-title>Towards evaluation guidelines for empirical studies involving LLMs</article-title><conf-name>2025 IEEE/ACM International Workshop on Methodological Issues with Empirical Studies in Software Engineering (WSESE)</conf-name><conf-date>May 3, 2025</conf-date><pub-id pub-id-type="doi">10.1109/WSESE66602.2025.00011</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The comprehensive, open-access materials necessary to ensure the transparency, reproducibility, and verifiability of the scoping review.</p><media xlink:href="ai_v5i1e80123_app1.zip" xlink:title="ZIP File, 1201 KB"/></supplementary-material><supplementary-material id="app2"><label>Checklist 1</label><p>PRISMA-ScR checklist.</p><media xlink:href="ai_v5i1e80123_app2.doc" xlink:title="DOC File, 121 KB"/></supplementary-material></app-group></back></article>