<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e71798</article-id><article-id pub-id-type="doi">10.2196/71798</article-id><article-categories><subj-group subj-group-type="heading"><subject>Viewpoint</subject></subj-group></article-categories><title-group><article-title>Rethinking AI Workflows: Guidelines for Scientific Evaluation in Digital Health Companies</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>McAlister</surname><given-names>Kelsey Lynn</given-names></name><degrees>MS, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gonzales</surname><given-names>Lee</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Huberty</surname><given-names>Jennifer</given-names></name><degrees>MS, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Fit Minded, Inc</institution><addr-line>2901 E Greenway Road, PO Box 30271</addr-line><addr-line>Phoenix</addr-line><addr-line>AZ</addr-line><country>United 
States</country></aff><aff id="aff2"><institution>Catalyst AI</institution><addr-line>Denver</addr-line><addr-line>CO</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Emam</surname><given-names>Khaled El</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mentis</surname><given-names>Alexios-Fotios A</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Sasseville</surname><given-names>Maxime</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Kelsey Lynn McAlister, MS, PhD, Fit Minded, Inc, 2901 E Greenway Road, PO Box 30271, Phoenix, AZ, 85046, United States, 1 (602) 935-6986; <email>kelsey@fit-minded.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>4</day><month>12</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e71798</elocation-id><history><date date-type="received"><day>26</day><month>01</month><year>2025</year></date><date date-type="rev-recd"><day>05</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>30</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Kelsey Lynn McAlister, Lee Gonzales, Jennifer Huberty. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 4.12.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e71798"/><abstract><p>Artificial intelligence (AI) is revolutionizing digital health, driving innovation in care delivery and operational efficiency. Despite its potential, many AI systems fail to meet real-world expectations due to limited evaluation practices that focus narrowly on short-term metrics like efficiency and technical accuracy. Ignoring factors such as usability, trust, transparency, and adaptability hinders AI adoption, scalability, and long-term impact in health care. This paper emphasizes the importance of embedding scientific evaluation as a core operational layer throughout the AI life cycle. We outline practical guidelines for digital health companies to improve AI integration and evaluation, informed by over 35 years of experience in science, the digital health industry, and AI development. It describes a multistep approach, including stakeholder analysis, real-time monitoring, and iterative improvement, that digital health companies can adopt to ensure robust AI integration. 
Key recommendations include assessing stakeholder needs, designing AI systems that can check their own work, conducting testing to address usability and biases, and ensuring continuous improvement to keep systems user-centered and adaptable. By integrating these guidelines, digital health companies can improve AI reliability, scalability, and trustworthiness, driving better health care delivery and stakeholder alignment.</p></abstract><kwd-group><kwd>industry</kwd><kwd>AI integration</kwd><kwd>user-centered design</kwd><kwd>health care delivery</kwd><kwd>digital health</kwd><kwd>workflow</kwd><kwd>scientific evaluation</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Digital health companies are increasingly leveraging artificial intelligence (AI) tools to transform care delivery and improve internal operations. AI is being used to develop customer-facing products, such as mental health chatbots and symptom-checking platforms, and to enhance efficiency within organizations, such as accelerating provider documentation workflows [<xref ref-type="bibr" rid="ref1">1</xref>]. AI adoption has surged globally, with the International Business Machines (IBM) Corporation reporting 35% adoption in 2022 [<xref ref-type="bibr" rid="ref2">2</xref>], and McKinsey &#x0026; Company finding this figure had risen to 72% by 2024 [<xref ref-type="bibr" rid="ref3">3</xref>]. This rapid growth underscores the transformative potential of AI, particularly generative AI (ie, technology that creates new content by learning from existing patterns), which is projected to contribute up to $4.4 trillion in economic value in the coming years [<xref ref-type="bibr" rid="ref4">4</xref>]. 
The American Psychological Association named AI as one of the top 10 trends in shaping the field of mental health, recognizing its growing influence [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>However, AI tools often fail to meet their potential. A recent study highlighted that symptom-checking chatbots frequently provide inaccurate or unhelpful recommendations, eroding user trust and raising patient safety concerns [<xref ref-type="bibr" rid="ref6">6</xref>]. Similarly, AI-powered transcription tools have been shown to fabricate information or introduce critical errors into clinical documentation, jeopardizing their reliability in real-world settings [<xref ref-type="bibr" rid="ref7">7</xref>]. Even in research, where AI can support aspects like code automation and study stimuli creation, challenges such as false outputs and breached ethics raise concerns [<xref ref-type="bibr" rid="ref5">5</xref>]. Additionally, unclear definitions of trust in health care AI contribute to these challenges, hindering ethical and effective translation into practice [<xref ref-type="bibr" rid="ref8">8</xref>]. These issues underscore the importance of integrating robust, scientifically grounded evaluations into AI tools to enhance their reliability, safety, and effectiveness. The evaluations of AI systems currently tend to prioritize key performance indicators (KPIs) such as efficiency for internal tools and technical performance for customer-facing products to demonstrate return on investment, neglecting essential factors such as usability, transparency, trust, and long-term reliability [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. This fragmented approach to AI evaluation results in several challenges, as internal tools frequently face resistance due to poor design or lack of clarity, and external systems often lose user trust when they fail to perform reliably in real-world contexts [<xref ref-type="bibr" rid="ref13">13</xref>]. 
Additionally, the absence of ongoing evaluation and iterative refinement leaves AI systems unable to adapt to evolving needs, which compounds inefficiencies and reduces their long-term impact [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. These gaps undermine the adoption and scalability of AI solutions and jeopardize their potential to drive sustainable change in digital health care.</p><p>Sadly, many digital health companies have yet to develop the guidelines and expertise needed to integrate AI effectively and maintain rigorous, ongoing evaluation. Without robust and continuous evaluation built in from the beginning, AI systems risk perpetuating errors, failing to meet stakeholder needs, and losing the trust of end-users. To address the shortcomings of current AI evaluations, including the emphasis on short-term KPIs, neglect of user-centered factors, and the lack of ongoing evaluations, AI integration should be approached with a scientific mindset that prioritizes evidence-based methods and continuous learning. A recent clinical-trials-inspired framework emphasizes safety, efficacy, and monitoring, but translating high-level guidance into operational practice remains a challenge for digital health companies [<xref ref-type="bibr" rid="ref15">15</xref>]. The purpose of this paper is to highlight the importance of embedding scientific evaluation as a core operational layer within AI workflows by providing practical guidelines for decision-makers (eg, C-suite leaders, product and operations leads, clinical directors, and AI implementation managers) in digital health companies. These guidelines are informed by over 35 years of cumulative experience in science, the digital health industry, and AI development. 
Adopting scientific practices offers digital health companies a pathway to strengthen their approach to AI integration, supporting more reliable and impactful outcomes in digital health care, differentiating themselves from competitors and contributing to revenue generation.</p></sec><sec id="s2"><title>Guidelines for Integrating and Evaluating AI in Digital Health Companies</title><sec id="s2-1"><title>Overview</title><p>The following guidelines outline key steps and recommendations to support digital health company leaders in integrating evaluation processes throughout the life cycle of AI systems, enhancing their effectiveness, scalability, and trustworthiness. While conceptually aligned with implementation science frameworks used in AI&#x2013;such as Consolidated Framework for Implementation Research, which highlights contextual and organizational factors that influence implementation [<xref ref-type="bibr" rid="ref16">16</xref>], and Proctor&#x2019;s outcomes, which define success through measures like feasibility and sustainability [<xref ref-type="bibr" rid="ref17">17</xref>]&#x2013;these recommendations are tailored to the fast-paced, cross-functional environments in which AI is developed and deployed in digital health.</p></sec><sec id="s2-2"><title>Evaluate Stakeholder Needs Before Implementation</title><p>Understanding the priorities and needs of stakeholders is a crucial first step to ensure AI systems align with real-world challenges and expectations. Stakeholders may include patients, clinicians, administrators, employees, and app users or consumers, depending on whether the system is designed for internal operations or as an external product. For example, evidence from intensive care settings highlights how involving diverse stakeholders in preimplementation assessments can significantly enhance the success of pilot testing, leading to better AI integration and usability [<xref ref-type="bibr" rid="ref18">18</xref>]. 
This approach also facilitates a scientifically grounded evaluation of potential barriers to adoption, such as workflow disruptions or concerns about transparency, that might otherwise hinder long-term success. In addition, co-creation approaches, where stakeholders actively help design and refine AI systems, add value by going beyond traditional consultation [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>]. These participatory approaches improve alignment with contextual knowledge, increase trust, and promote long-term adoption of AI tools in health care settings [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. By applying scientific evaluation methods at this stage, behavioral and AI scientists, product developers, and operational leaders can systematically identify and address the specific needs and priorities of intended users, guiding the selection and design of AI systems that are both evidence-based and effective in meeting stakeholder requirements.</p><p>To assess stakeholder needs effectively, digital health companies should:</p><list list-type="bullet"><list-item><p>Leverage collaborations and partnerships with industry experts and research scientists. These partnerships can help ensure that the AI system aligns with scientific standards while also remaining feasible for implementation within the company.</p></list-item><list-item><p>Conduct qualitative and quantitative assessments with end-users to understand expectations, pain points, desired outcomes, and what AI platforms they may already use. Research and user experience teams can spearhead these efforts, leveraging a human-centered approach to ensure the system aligns with real-world needs and user priorities [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. 
Qualitative methods offer in-depth insights, while quantitative approaches help capture broader trends across diverse user groups. For example, a digital health company could interview clinicians to uncover documentation challenges, then run a survey to assess anticipated usability, perceived efficiency, and readiness to adopt an AI tool.</p></list-item><list-item><p>Use co-creation strategies, such as co-design workshops or participatory prototyping, to allow stakeholders to directly influence system functionality, content, and workflows. These methods surface context-specific needs that traditional assessments may miss and help improve usability, trust, and alignment with end-user expectations.</p></list-item><list-item><p>Develop user personas and journey maps to understand how the AI system fits into existing workflows or end-user experiences. This approach can help teams visualize user interactions, surface potential friction points, and inform refinements that support usability and integration, especially when combined with direct stakeholder input gathered through participatory design activities.</p></list-item></list></sec><sec id="s2-3"><title>Design AI Systems That Check Their Own Work</title><p>To ensure that AI systems are robust, effective, and user-centered throughout their life cycle, digital health companies should embed scientific evaluation mechanisms directly into their design. However, many companies currently underinvest in rigorous evaluation processes, leading to inconsistent progress, flawed AI tools that fail to meet business objectives, canceled projects, and wasted resources [<xref ref-type="bibr" rid="ref26">26</xref>]. AI evaluations should balance traditional KPIs, such as accuracy and efficiency, with metrics that allow the system to monitor and reflect on user experience, trust, usability, and satisfaction. 
By enabling AI tools to &#x201C;check their own work,&#x201D; companies can create systems that not only meet company goals but also foster user trust and adoption&#x2014;key factors for achieving sustained impact and scalability in health care settings. This includes designing systems that can detect uncertainty, surface potential issues, and escalate to human input when appropriate. Companies should consider the cross-functional work of teams such as engineering, technology, and science to ensure appropriate design, implementation, and evaluation of AI systems that align with stakeholder needs and operational goals.</p><p>In particular, digital health companies should integrate human-in-the-loop (HITL) methodologies, an approach that embeds human judgment into the training, validation, and deployment phases of AI tools. This enables teams to guide model development, intervene during deployment, and refine outputs in real-time, improving adaptability, safety, and trustworthiness [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. HITL is distinct from broader governance or post-hoc audits in that it provides direct, real-time oversight within system workflows. This is especially important in clinical and behavioral health contexts, where ethical and contextual judgment cannot be fully automated.</p><p>Several scientific frameworks have been developed to evaluate AI tools, offering valuable guidance on embedding evaluation into the design process. Frameworks such as Standard Protocol Items: Recommendations for Interventional Trials-AI (SPIRIT-AI; [<xref ref-type="bibr" rid="ref29">29</xref>]) and Consolidated Standards of Reporting Trials&#x2013;AI (CONSORT-AI; [<xref ref-type="bibr" rid="ref30">30</xref>]) focus on building transparency, trust, and rigor during the design and reporting phases of clinical trials for AI. 
While these frameworks emphasize preimplementation evaluation, others, such as Translational Evaluation of Healthcare AI (TEHAI; [<xref ref-type="bibr" rid="ref31">31</xref>]) and Explainable AI (XAI; [<xref ref-type="bibr" rid="ref32">32</xref>]), address specific aspects like performance, safety, ethical considerations, and user trust.</p><p>While these frameworks provide a strong foundation, they often focus on discrete stages of evaluation and may not fully incorporate HITL approaches that enable continuous input and oversight throughout the AI life cycle. Digital health companies must go further by embedding evaluation mechanisms into workflows to ensure continuous monitoring and improvement. Without such mechanisms, teams risk uneven progress, flawed implementations, and ultimately, AI tools that fail to meet stakeholder needs or achieve business goals [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>When building AI systems that can monitor themselves, digital health companies should:</p><list list-type="bullet"><list-item><p>Prioritize early investment in AI evaluation to build a strong foundation for assessing effectiveness throughout the AI tool&#x2019;s life cycle. This approach ensures potential challenges are proactively addressed, which supports smoother implementation and long-term adaptability. 
This may include consulting with behavioral or AI scientists to design evidence-based evaluation methods, identify potential biases, and refine AI system performance to align better with real-world needs.</p></list-item><list-item><p>Consider existing scientific frameworks as a foundation for designing AI tools with transparency and rigor, while adapting them to include mechanisms for ongoing, real-world evaluation that captures both technical performance and comprehensive user experience metrics.</p></list-item><list-item><p>Develop automated tools for ongoing evaluation that track metrics aligned with both user priorities and business objectives, such as technical accuracy, error rates, user satisfaction, and productivity. These evaluations streamline development by concentrating efforts on critical areas, increasing the likelihood of deploying AI systems that effectively meet organizational goals and end-user needs [<xref ref-type="bibr" rid="ref26">26</xref>]. For example, automated tools could monitor user interactions on a mental health platform, such as response times, task completion rates, and drop-off points, allowing product teams and behavioral or AI scientists to identify areas for improvement and enhance user experience.</p></list-item><list-item><p>Establish feedback loops that allow for end-users to provide feedback in real time, ensuring their perceptions are consistently captured and integrated into system updates.</p></list-item><list-item><p>Embed HITL components such as human review panels, clinician-in-the-loop decision support, or structured escalation processes that ensure human judgment is available at key junctures. 
HITL differs from general human oversight or feedback mechanisms in that it places human judgment directly within the AI system&#x2019;s workflow, enabling real-time intervention to correct system drift, mitigate error propagation, and uphold ethical safeguards.</p></list-item><list-item><p>Incorporate routine bias audits into evaluation workflows to assess whether the AI system performs equitably across user subgroups. This is particularly important in health settings where automated systems can unintentionally amplify disparities, especially among low-prevalence or underserved populations [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. Regularly reviewing model outputs by demographic characteristics and edge cases can help teams identify and mitigate bias early in the deployment cycle.</p></list-item><list-item><p>Establish human oversight to ensure accountability, mitigate potential biases, and validate performance. This includes establishing a scientific AI evaluation leader or multidisciplinary review teams to regularly assess the system&#x2019;s outputs, identify blind spots in automated evaluations, and ensure alignment with company goals and user needs. Human input is critical for addressing nuances and ethical considerations that AI alone may overlook, ensuring the system&#x2019;s outputs remain contextually appropriate and trustworthy. Oversight reinforces governance and long-term trust, complementing the real-time, embedded nature of HITL.</p></list-item></list></sec><sec id="s2-4"><title>Testing and Refinement Before Implementation</title><p>AI systems should undergo beta, feasibility, and pilot testing, which involves the collaboration of research (ie, behavioral and AI scientists), user experience, product, engineering, and operations teams, to ensure they are ready for real-world implementation. 
These phases provide an opportunity to identify potential issues related to usability, performance, and integration within real-world workflows before full implementation. For example, beta testing helps gather quick feedback from real-world users to refine usability and make early improvements. Feasibility testing can be used to evaluate resource requirements and alignment with the business goals of the company, ensuring practical deployment and sustainability. Pilot testing can also be used to further refine the AI tool and assess initial outcomes for viability. For instance, pilot testing has been essential in improving the intuitiveness of health care chatbots [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>To effectively test and refine before implementation, digital health companies should:</p><list list-type="bullet"><list-item><p>Conduct feasibility tests early to assess whether the AI tool aligns with business objectives, technical infrastructure, and resource availability. This ensures the AI is viable and positioned for successful implementation.</p></list-item><list-item><p>Engage diverse stakeholders, such as clinicians, administrators, and end-users during testing and refinement to gather comprehensive feedback. For example, when testing an AI-powered clinical decision support tool, a company could engage physicians to ensure recommendations align with clinical guidelines, administrators to assess integration with existing electronic health record systems, and nurses to evaluate usability and workflow compatibility.</p></list-item><list-item><p>Iterate, based on findings, by using the feedback from these testing stages to refine the AI system, addressing issues such as workflow integration and potential user resistance. Structured reviewer input or user flagging mechanisms, when embedded into the system&#x2019;s operation, can function as HITL approaches that support more responsive and ethical refinement. 
AI scientists and operations teams should work closely with behavioral scientists to ensure the system evolves based on real-world insights.</p></list-item></list></sec><sec id="s2-5"><title>Implement With Real-Time Monitoring and Data Collection</title><p>Implementing an AI tool is an important opportunity for digital health companies to gather actionable insights about how they perform in real-world settings. Real-time monitoring and data collection allow companies to identify emerging issues, refine workflows, and validate that AI tools meet both technical and user-centered expectations. For example, LinkedIn leveraged a deep-learning-based monitoring system to track the health of its AI models, identifying issues in real-time to improve business outcomes [<xref ref-type="bibr" rid="ref37">37</xref>]. This proactive approach supports scalability and long-term adoption by addressing challenges early in deployment. Engineering, operations, and research (eg, behavioral and AI scientists and data analysts) should collaborate to establish monitoring systems and analyze findings.</p><p>To implement real-time monitoring and data collection effectively, digital health companies should:</p><list list-type="bullet"><list-item><p>Deploy automated, self-check monitoring systems to continuously track both traditional KPIs (eg, accuracy, response times, and error rates) and user experience metrics (eg, task completion rates, interaction frequency, and perceived usability). Leverage the evaluation mechanisms embedded during the AI system&#x2019;s design.</p></list-item><list-item><p>Analyze incoming data to systematically identify patterns or recurring issues that may impact the AI&#x2019;s performance or user engagement. Structured analyses, conducted by the company&#x2019;s data analysts and behavioral scientists, help prioritize areas for improvement and ensure resources are allocated effectively. 
For example, by analyzing user interaction patterns, a company might find that users tend to leave an AI-powered chat when provided with lengthy responses, prompting the need to shorten message length to improve engagement.</p></list-item><list-item><p>Implement strategic refinements based on monitored insights to address significant challenges or adapt the AI system to evolving user needs and company priorities. Postlaunch updates should be carefully planned and aligned with long-term goals. Where appropriate, HITL mechanisms can support these refinements by enabling human input in ambiguous, high-stakes, or ethically sensitive situations.</p></list-item></list></sec><sec id="s2-6"><title>Continue Evaluation and Iterative Improvement After Implementation</title><p>Once an AI system is implemented, ongoing evaluation becomes essential to ensure it continues to meet company goals and user expectations. AI tools often experience performance degradation over time as changes in usage patterns, data inputs, workflows, user needs, and external factors (eg, regulatory changes, updates to clinical guidelines) require them to adapt to maintain their effectiveness and relevance. For example, generative AI models (eg, ChatGPT) present unique challenges due to their inherent randomness, making repeated evaluations essential to ensure reliable performance [<xref ref-type="bibr" rid="ref38">38</xref>]. Additionally, the dynamic nature of AI, particularly generative AI, requires digital health companies to continuously adopt and adapt to rapidly improving models with enhanced capabilities and significant cost fluctuations. Recent benchmarking data from Epoch shows that once models reach certain levels of computing power, they experience significant jumps in performance on tasks [<xref ref-type="bibr" rid="ref39">39</xref>]. 
Furthermore, when GPT-4 was initially released in March 2023, it cost US $36 per million tokens (ie, units of text used to process input and generate output), but by late 2024, this price had dropped to just $0.25 per million tokens&#x2014;a staggering 99% reduction [<xref ref-type="bibr" rid="ref40">40</xref>]. This sharp drop in cost highlights how quickly AI technology evolves, making advanced tools more affordable over time. For digital health companies, this means they must regularly evaluate whether adopting updated models is both practical and beneficial, ensuring they use the most effective and cost-efficient solutions while staying aligned with their goals and user needs. Product teams, behavioral and AI scientists, and operations specialists should collaborate to monitor performance, gather user feedback, and adapt systems to evolving needs and guidelines.</p><p>To continuously evaluate and improve AI systems, digital health companies should:</p><list list-type="bullet"><list-item><p>Conduct regular audits to assess technical metrics, such as accuracy and reliability, alongside user experience metrics, including satisfaction and usability. These audits help identify whether the system is meeting its intended objectives and uncover opportunities for optimization. They should also maintain broader human oversight (beyond HITL mechanisms), which could include a scientific AI evaluation leader and/or a multidisciplinary team.</p></list-item><list-item><p>Incorporate usability testing as a continuous process to regularly identify pain points and opportunities for improvement among diverse user groups. Regularly engaging with end-users ensures that the system adapts to their evolving needs and remains intuitive and efficient. 
For example, ongoing usability testing for an AI-driven mental health platform could involve observing end-users as they navigate key features, such as finding a therapist or accessing self-help tools, to identify usability challenges and inform iterative design improvements.</p></list-item><list-item><p>Prioritize publishing and reporting on AI performance, user experiences, and trust-building metrics throughout AI integration (ie, from beta testing to post-launch). Reporting on technical metrics alongside user-focused insights offers a holistic view of AI system effectiveness. For example, companies can conduct retrospective analyses of de-identified conversation content and usage patterns to identify trends and gaps, guiding future improvements. While peer-reviewed outputs are valuable, resource-constrained teams may benefit from alternative dissemination methods, such as implementation briefs, open-access case reports, webinars, or practice-based repositories, that enable rapid, practical knowledge-sharing. By sharing these findings, companies contribute to greater accountability, advance innovation, and guide the development of AI tools that meet user and company needs.</p></list-item><list-item><p>Maintain human oversight that could include a scientific AI evaluation leader or a multidisciplinary team of technical, clinical, and operational experts. Human oversight is needed to ensure that AI systems can continuously adapt to new data, address unforeseen issues, and uphold ethical and performance standards in dynamic health care settings.</p></list-item></list></sec></sec><sec id="s3" sec-type="conclusions"><title>Conclusions</title><p>The integration of AI into digital health presents a transformative opportunity to enhance care delivery, optimize operations, and improve patient outcomes. However, its success hinges on a commitment to continuous, scientifically grounded evaluation. 
Scientific evaluation is not just a checkpoint&#x2014;it is an operational layer that should be embedded into workflows to ensure trust, scalability, and measurable impact. While not developed through a formal consensus process or systematic review, the guidelines outlined in this paper are informed by over 35 years of cumulative experience across science, the digital health industry, and AI development. They advocate for incorporating scientific evaluation processes that balance technical performance with user-centered metrics, enabling digital health companies to ensure their AI tools remain effective, adaptable, and trustworthy over time. This approach may enhance the reliability and scalability of AI systems and drive revenue growth by improving user satisfaction, increasing adoption rates, and streamlining operations. Achieving these outcomes requires cross-functional collaboration between behavioral and AI scientists, data analysts, product teams, engineers, and operations staff. Together, these teams can ensure AI solutions are aligned with business objectives, meet stakeholder needs, and deliver meaningful, scalable impact in digital health care. A key future direction is to formally build on these recommendations through a structured, cross-disciplinary consensus process.</p></sec></body><back><notes><sec><title>Disclaimer</title><p>All authors are employees of Fit Minded, Inc. or Catalyst AI. 
The views expressed in this manuscript are those of the authors and do not necessarily reflect the official position of these organizations.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CONSORT-AI</term><def><p>Consolidated Standards of Reporting Trials&#x2013;AI</p></def></def-item><def-item><term id="abb3">HITL</term><def><p>human-in-the-loop</p></def></def-item><def-item><term id="abb4">KPI</term><def><p>key performance indicator</p></def></def-item><def-item><term id="abb5">SPIRIT-AI</term><def><p>Standard Protocol Items: Recommendations for Interventional Trials-AI</p></def></def-item><def-item><term id="abb6">TEHAI</term><def><p>Translational Evaluation of Healthcare AI</p></def></def-item><def-item><term id="abb7">XAI</term><def><p>Explainable AI</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Koul</surname><given-names>A</given-names> </name><name name-style="western"><surname>Singla</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ijaz</surname><given-names>MF</given-names> </name></person-group><article-title>Artificial intelligence in disease diagnosis: a systematic literature review, synthesizing framework and future research agenda</article-title><source>J Ambient Intell Humaniz Comput</source><year>2023</year><volume>14</volume><issue>7</issue><fpage>8459</fpage><lpage>8486</lpage><pub-id pub-id-type="doi">10.1007/s12652-021-03612-z</pub-id><pub-id pub-id-type="medline">35039756</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="report"><article-title>IBM global AI adoption index 2022</article-title><year>2022</year><access-date>2025-11-13</access-date><publisher-name>IBM</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.snowdropsolution.com/pdf/IBM%20Global%20AI%20Adoption%20Index%202022.pdf">https://www.snowdropsolution.com/pdf/IBM%20Global%20AI%20Adoption%20Index%202022.pdf</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>The state of AI in early 2024</article-title><source>Quantum Black AI by McKinsey</source><year>2024</year><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-state-of-ai">https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-state-of-ai</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="web"><article-title>Implementing generative AI with speed and safety</article-title><source>McKinsey and Company</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/implementing-generative-ai-with-speed-and-safety">https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/implementing-generative-ai-with-speed-and-safety</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><article-title>Artificial intelligence is impacting the field</article-title><source>American Psychological Association</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.apa.org/monitor/2025/01/trends-harnessing-power-of-artificial-intelligence">https://www.apa.org/monitor/2025/01/trends-harnessing-power-of-artificial-intelligence</ext-link></comment></nlm-citation></ref><ref 
id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johri</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>BA</given-names> </name><etal/></person-group><article-title>An evaluation framework for clinical use of large language models in patient interaction tasks</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>77</fpage><lpage>86</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03328-5</pub-id><pub-id pub-id-type="medline">39747685</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><article-title>Researchers say AI transcription tool used in hospitals invents things no one ever said</article-title><source>AP News</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://apnews.com/article/ai-artificial-intelligence-health-business-90020cdf5fa16c79ca2e5b6c4c9bbb14">https://apnews.com/article/ai-artificial-intelligence-health-business-90020cdf5fa16c79ca2e5b6c4c9bbb14</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>B&#x00FC;rger</surname><given-names>VK</given-names> </name><name name-style="western"><surname>Amann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bui</surname><given-names>CKT</given-names> </name><name name-style="western"><surname>Fehr</surname><given-names>J</given-names> </name><name name-style="western"><surname>Madai</surname><given-names>VI</given-names> </name></person-group><article-title>The unmet promise of trustworthy AI in healthcare: why we fail at clinical 
translation</article-title><source>Front Digit Health</source><year>2024</year><volume>6</volume><fpage>1279629</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2024.1279629</pub-id><pub-id pub-id-type="medline">38698888</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>AI performance metrics: the science &#x0026; art of measuring AI</article-title><source>version1</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.version1.com/blog/ai-performance-metrics-the-science-and-art-of-measuring-ai/">https://www.version1.com/blog/ai-performance-metrics-the-science-and-art-of-measuring-ai/</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Grootjans</surname><given-names>W</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ranschaert</surname><given-names>E</given-names> </name><name name-style="western"><surname>Mehrizi</surname><given-names>MHR</given-names> </name><name name-style="western"><surname>Grootjans</surname><given-names>W</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>TS</given-names> </name></person-group><source>Evaluation, Monitoring, and Improvement, in AI Implementation in Radiology: Challenges and Opportunities in Clinical Practice</source><year>2024</year><publisher-name>Springer Nature Switzerland</publisher-name><fpage>131</fpage><lpage>159</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-68942-0_8</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>AI&#x2019;s trust problem</article-title><source>Harvard Business Review</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://hbr.org/2024/05/ais-trust-problem">https://hbr.org/2024/05/ais-trust-problem</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oveisi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gholamrezaie</surname><given-names>F</given-names> </name><name name-style="western"><surname>Qajari</surname><given-names>N</given-names> </name><name name-style="western"><surname>Moein</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Goodarzi</surname><given-names>M</given-names> </name></person-group><article-title>Review of artificial intelligence-based systems: evaluation, standards, and methods</article-title><source>Advances in the Standards &#x0026; Applied Sciences</source><year>2024</year><volume>2</volume><issue>2</issue><fpage>4</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.22034/asas.2024.450378.1055</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mennella</surname><given-names>C</given-names> </name><name name-style="western"><surname>Maniscalco</surname><given-names>U</given-names> </name><name name-style="western"><surname>De Pietro</surname><given-names>G</given-names> </name><name name-style="western"><surname>Esposito</surname><given-names>M</given-names> </name></person-group><article-title>Ethical and regulatory challenges of AI technologies in healthcare: a narrative review</article-title><source>Heliyon</source><year>2024</year><month>02</month><day>29</day><volume>10</volume><issue>4</issue><fpage>e26297</fpage><pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e26297</pub-id><pub-id pub-id-type="medline">38384518</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="web"><article-title>Companies to shift AI goals in 2025 &#x2014; with setbacks inevitable, Forrester predicts</article-title><source>CIO</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cio.com/article/3583638/companies-to-shift-ai-goals-in-2025-with-setbacks-inevitable-forrester-predicts.html">https://www.cio.com/article/3583638/companies-to-shift-ai-goals-in-2025-with-setbacks-inevitable-forrester-predicts.html</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>You</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Hernandez-Boussard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pfeffer</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Landman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mishuris</surname><given-names>RG</given-names> </name></person-group><article-title>Clinical trials informed framework for real world clinical implementation and deployment of artificial intelligence applications</article-title><source>NPJ Digit Med</source><year>2025</year><month>02</month><day>17</day><volume>8</volume><issue>1</issue><fpage>107</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01506-4</pub-id><pub-id pub-id-type="medline">39962232</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schouten</surname><given-names>B</given-names> </name><name name-style="western"><surname>Schinkel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Boerman</surname><given-names>AW</given-names> </name><etal/></person-group><article-title>Implementing artificial intelligence in clinical 
practice: a mixed-method study of barriers and facilitators</article-title><source>J Med Artif Intell</source><year>2022</year><month>12</month><volume>5</volume><fpage>12</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.21037/jmai-22-71</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van de Sande</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chung</surname><given-names>EFF</given-names> </name><name name-style="western"><surname>Oosterhoff</surname><given-names>J</given-names> </name><name name-style="western"><surname>van Bommel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gommers</surname><given-names>D</given-names> </name><name name-style="western"><surname>van Genderen</surname><given-names>ME</given-names> </name></person-group><article-title>To warrant clinical adoption AI models require a multi-faceted implementation evaluation</article-title><source>NPJ Digit Med</source><year>2024</year><month>03</month><day>6</day><volume>7</volume><issue>1</issue><fpage>58</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01064-1</pub-id><pub-id pub-id-type="medline">38448743</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mosch</surname><given-names>LK</given-names> </name><name name-style="western"><surname>Poncette</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Spies</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Creation of an evidence-based implementation framework for digital health technology in the intensive care unit: qualitative study</article-title><source>JMIR Form 
Res</source><year>2022</year><month>04</month><day>8</day><volume>6</volume><issue>4</issue><fpage>e22866</fpage><pub-id pub-id-type="doi">10.2196/22866</pub-id><pub-id pub-id-type="medline">35394445</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Swan</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Peltier</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Dahl</surname><given-names>AJ</given-names> </name></person-group><article-title>Artificial intelligence in healthcare: the value co-creation process and influence of other digital health transformations</article-title><source>JRIM</source><year>2024</year><month>01</month><day>30</day><volume>18</volume><issue>1</issue><fpage>109</fpage><lpage>126</lpage><pub-id pub-id-type="doi">10.1108/JRIM-09-2022-0293</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barile</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bassano</surname><given-names>C</given-names> </name><name name-style="western"><surname>Piciocchi</surname><given-names>P</given-names> </name><name name-style="western"><surname>Saviano</surname><given-names>M</given-names> </name><name name-style="western"><surname>Spohrer</surname><given-names>JC</given-names> </name></person-group><article-title>Empowering value co-creation in the digital age</article-title><source>JBIM</source><year>2021</year><month>05</month><day>30</day><volume>39</volume><issue>6</issue><fpage>1130</fpage><lpage>1143</lpage><pub-id pub-id-type="doi">10.1108/JBIM-12-2019-0553</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Nadarzynski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Knights</surname><given-names>N</given-names> </name><name name-style="western"><surname>Husbands</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Achieving health equity through conversational AI: a roadmap for design and implementation of inclusive chatbots in healthcare</article-title><source>PLOS Digit Health</source><year>2024</year><month>05</month><volume>3</volume><issue>5</issue><fpage>e0000492</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000492</pub-id><pub-id pub-id-type="medline">38696359</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nadarzynski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Miles</surname><given-names>O</given-names> </name><name name-style="western"><surname>Cowie</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ridge</surname><given-names>D</given-names> </name></person-group><article-title>Acceptability of artificial intelligence (AI)-led chatbot services in healthcare: a mixed-methods study</article-title><source>Digit Health</source><year>2019</year><volume>5</volume><fpage>2055207619871808</fpage><pub-id pub-id-type="doi">10.1177/2055207619871808</pub-id><pub-id pub-id-type="medline">31467682</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sadasivan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cruz</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dolgoy</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Examining patient engagement in chatbot 
development approaches for healthy lifestyle and mental wellness interventions: scoping review</article-title><source>J Particip Med</source><year>2023</year><month>05</month><day>22</day><volume>15</volume><issue>1</issue><fpage>e45772</fpage><pub-id pub-id-type="doi">10.2196/45772</pub-id><pub-id pub-id-type="medline">37213199</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bajwa</surname><given-names>J</given-names> </name><name name-style="western"><surname>Munir</surname><given-names>U</given-names> </name><name name-style="western"><surname>Nori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>B</given-names> </name></person-group><article-title>Artificial intelligence in healthcare: transforming the practice of medicine</article-title><source>Future Healthc J</source><year>2021</year><month>07</month><volume>8</volume><issue>2</issue><fpage>e188</fpage><lpage>e194</lpage><pub-id pub-id-type="doi">10.7861/fhj.2021-0095</pub-id><pub-id pub-id-type="medline">34286183</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schoenherr</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Abbas</surname><given-names>R</given-names> </name><name name-style="western"><surname>Michael</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rivas</surname><given-names>P</given-names> </name><name name-style="western"><surname>Anderson</surname><given-names>TD</given-names> </name></person-group><article-title>Designing AI using a human-centered approach: explainability and accuracy toward trustworthiness</article-title><source>IEEE Trans Technol 
Soc</source><year>2023</year><month>03</month><volume>4</volume><issue>1</issue><fpage>9</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1109/TTS.2023.3257627</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Ramakrishnan</surname><given-names>R</given-names> </name></person-group><article-title>The GenAI app step you&#x2019;re skimping on: evaluations</article-title><source>MIT Sloan Management Review</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://sloanreview.mit.edu/article/the-genai-app-step-youre-skimping-on-evaluations/">https://sloanreview.mit.edu/article/the-genai-app-step-youre-skimping-on-evaluations/</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mosqueira-Rey</surname><given-names>E</given-names> </name><name name-style="western"><surname>Hern&#x00E1;ndez-Pereira</surname><given-names>E</given-names> </name><name name-style="western"><surname>Alonso-R&#x00ED;os</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bobes-Bascar&#x00E1;n</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fern&#x00E1;ndez-Leal</surname><given-names>&#x00C1;</given-names> </name></person-group><article-title>Human-in-the-loop machine learning: a state of the art</article-title><source>Artif Intell Rev</source><year>2023</year><month>04</month><volume>56</volume><issue>4</issue><fpage>3005</fpage><lpage>3054</lpage><pub-id pub-id-type="doi">10.1007/s10462-022-10246-w</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Memarian</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Doleck</surname><given-names>T</given-names> </name></person-group><article-title>Human-in-the-loop in artificial intelligence in education: a review and entity-relationship (ER) analysis</article-title><source>Computers in Human Behavior: Artificial Humans</source><year>2024</year><month>01</month><volume>2</volume><issue>1</issue><fpage>100053</fpage><pub-id pub-id-type="doi">10.1016/j.chbah.2024.100053</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cruz Rivera</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>AW</given-names> </name><etal/></person-group><article-title>Guidelines for clinical trial protocols for interventions involving artificial intelligence: the SPIRIT-AI extension</article-title><source>The Lancet Digital Health</source><year>2020</year><month>10</month><volume>2</volume><issue>10</issue><fpage>e549</fpage><lpage>e560</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(20)30219-3</pub-id><pub-id pub-id-type="medline">33015597</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Cruz Rivera</surname><given-names>S</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Calvert</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Denniston</surname><given-names>AK</given-names> </name><collab>SPIRIT-AI and CONSORT-AI Working Group</collab></person-group><article-title>Reporting guidelines for clinical trial reports for 
interventions involving artificial intelligence: the CONSORT-AI extension</article-title><source>Nat Med</source><year>2020</year><month>09</month><volume>26</volume><issue>9</issue><fpage>1364</fpage><lpage>1374</lpage><pub-id pub-id-type="doi">10.1038/s41591-020-1034-x</pub-id><pub-id pub-id-type="medline">32908283</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reddy</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rogers</surname><given-names>W</given-names> </name><name name-style="western"><surname>Makinen</surname><given-names>VP</given-names> </name><etal/></person-group><article-title>Evaluation framework to guide implementation of AI systems into healthcare settings</article-title><source>BMJ Health Care Inform</source><year>2021</year><month>10</month><volume>28</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1136/bmjhci-2021-100444</pub-id><pub-id pub-id-type="medline">34642177</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>S</given-names> </name><name name-style="western"><surname>Abuhmed</surname><given-names>T</given-names> </name><name name-style="western"><surname>El-Sappagh</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Explainable artificial intelligence (XAI): What we know and what is left to attain trustworthy artificial intelligence</article-title><source>Information Fusion</source><year>2023</year><month>11</month><volume>99</volume><fpage>101805</fpage><pub-id pub-id-type="doi">10.1016/j.inffus.2023.101805</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Obermeyer</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Powers</surname><given-names>B</given-names> </name><name name-style="western"><surname>Vogeli</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mullainathan</surname><given-names>S</given-names> </name></person-group><article-title>Dissecting racial bias in an algorithm used to manage the health of populations</article-title><source>Science</source><year>2019</year><month>10</month><day>25</day><volume>366</volume><issue>6464</issue><fpage>447</fpage><lpage>453</lpage><pub-id pub-id-type="doi">10.1126/science.aax2342</pub-id><pub-id pub-id-type="medline">31649194</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehrabi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Morstatter</surname><given-names>F</given-names> </name><name name-style="western"><surname>Saxena</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lerman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Galstyan</surname><given-names>A</given-names> </name></person-group><article-title>A survey on bias and fairness in machine learning</article-title><source>ACM Comput Surv</source><year>2022</year><month>07</month><day>31</day><volume>54</volume><issue>6</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3457607</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maenhout</surname><given-names>L</given-names> </name><name name-style="western"><surname>Peuters</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cardon</surname><given-names>G</given-names> 
</name><name name-style="western"><surname>Compernolle</surname><given-names>S</given-names> </name><name name-style="western"><surname>Crombez</surname><given-names>G</given-names> </name><name name-style="western"><surname>DeSmet</surname><given-names>A</given-names> </name></person-group><article-title>Participatory development and pilot testing of an adolescent health promotion chatbot</article-title><source>Front Public Health</source><year>2021</year><volume>9</volume><fpage>724779</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2021.724779</pub-id><pub-id pub-id-type="medline">34858919</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lau-Min</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Marini</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>NK</given-names> </name><etal/></person-group><article-title>Pilot study of a mobile phone chatbot for medication adherence and toxicity management among patients with GI cancers on capecitabine</article-title><source>JCO Oncol Pract</source><year>2024</year><month>04</month><volume>20</volume><issue>4</issue><fpage>483</fpage><lpage>490</lpage><pub-id pub-id-type="doi">10.1200/OP.23.00365</pub-id><pub-id pub-id-type="medline">38237102</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Balaji</surname><given-names>G</given-names> </name><etal/></person-group><article-title>AlerTiger: deep learning for AI model health monitoring at 
linkedin</article-title><year>2023</year><month>08</month><day>6</day><conf-name>KDD &#x2019;23</conf-name><conf-date>Aug 6, 2023</conf-date><conf-loc>Long Beach CA USA</conf-loc><fpage>5350</fpage><lpage>5359</lpage><pub-id pub-id-type="doi">10.1145/3580305.3599802</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hong</surname><given-names>C</given-names> </name><etal/></person-group><article-title>The evaluation of generative AI should include repetition to assess stability</article-title><source>JMIR Mhealth Uhealth</source><year>2024</year><month>05</month><day>6</day><volume>12</volume><issue>1</issue><fpage>e57978</fpage><pub-id pub-id-type="doi">10.2196/57978</pub-id><pub-id pub-id-type="medline">38688841</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="web"><article-title>AI benchmarking dashboard</article-title><source>Epoch AI</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://epoch.ai/data/ai-benchmarking-dashboard">https://epoch.ai/data/ai-benchmarking-dashboard</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="web"><article-title>Falling LLM token prices and what they mean for AI companies</article-title><source>The Batch</source><access-date>2025-01-24</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.deeplearning.ai/the-batch/falling-llm-token-prices-and-what-they-mean-for-ai-companies/">https://www.deeplearning.ai/the-batch/falling-llm-token-prices-and-what-they-mean-for-ai-companies/</ext-link></comment></nlm-citation></ref></ref-list></back></article>