<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v4i1e75030</article-id>
      <article-id pub-id-type="pmid">41118647</article-id>
      <article-id pub-id-type="doi">10.2196/75030</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Aiding Large Language Models Using Clinical Scoresheets for Neurobehavioral Diagnostic Classification From Text: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhou</surname>
            <given-names>Weipeng</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Guo</surname>
            <given-names>Jielong</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Kaiying</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7691-1407</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Rasool</surname>
            <given-names>Abdur</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5334-9001</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Surabhi</surname>
            <given-names>Saimourya</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1707-0537</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Mutlu</surname>
            <given-names>Cezmi</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9263-9332</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Haopeng</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-7017-0717</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Wall</surname>
            <given-names>Dennis P</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7889-9146</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Washington</surname>
            <given-names>Peter</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <address>
            <institution>University of California, San Francisco</institution>
            <addr-line>10 Koret Way, #323</addr-line>
            <addr-line>San Francisco, CA 94117</addr-line>
            <country>United States</country>
            <phone>1 415 353 2067</phone>
            <email>Peter.Washington@ucsf.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3276-4411</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Institute of Linguistics</institution>
        <institution>Academia Sinica</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>University of Hawaiʻi at Mānoa</institution>
        <addr-line>Honolulu, HI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Stanford University</institution>
        <addr-line>Stanford, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>University of California, San Francisco</institution>
        <addr-line>San Francisco</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Peter Washington <email>Peter.Washington@ucsf.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>21</day>
        <month>10</month>
        <year>2025</year>
      </pub-date>
      <volume>4</volume>
      <elocation-id>e75030</elocation-id>
      <history>
        <date date-type="received">
          <day>28</day>
          <month>3</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>5</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>8</day>
          <month>7</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>8</day>
          <month>8</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Kaiying Lin, Abdur Rasool, Saimourya Surabhi, Cezmi Mutlu, Haopeng Zhang, Dennis P Wall, Peter Washington. Originally published in JMIR AI (https://ai.jmir.org), 21.10.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2025/1/e75030" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs) have demonstrated the ability to perform complex tasks traditionally requiring human intelligence. However, their use in automated diagnostics for psychiatry and behavioral sciences remains understudied.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to evaluate whether incorporating structured clinical assessment scales improved the diagnostic performance of LLM-based chatbots for neuropsychiatric conditions (we evaluated autism spectrum disorder, aphasia, and depression datasets) across two prompting strategies: (1) direct diagnosis and (2) code generation. We aimed to contextualize LLM-based diagnostic performance by benchmarking it against prior work that applied traditional machine learning classifiers to the same datasets, allowing us to assess whether LLMs offer competitive or complementary capabilities in clinical classification tasks.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We tested two approaches using ChatGPT, Gemini, and Claude models: (1) direct diagnostic querying and (2) execution of chatbot-generated code for classification. Three diagnostic datasets were used: ASDBank (autism spectrum disorder), AphasiaBank (aphasia), and Distress Analysis Interview Corpus-Wizard-of-Oz interviews (depression and related conditions). Each approach was evaluated with and without the aid of clinical assessment scales. Performance was compared to existing machine learning benchmarks on these datasets.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Across all 3 datasets, incorporating clinical assessment scales led to little improvement in performance, and results remained inconsistent and generally below those reported in previous studies. On the AphasiaBank dataset, the direct diagnosis approach using ChatGPT with GPT-4 produced a low <italic>F</italic><sub>1</sub>-score of 65.6% and specificity of 33%. The code generation method improved results, with ChatGPT with GPT-4o reaching an <italic>F</italic><sub>1</sub>-score of 81.4%, specificity of 78.6%, and sensitivity of 84.3%. ChatGPT with GPT-o3 and Gemini 2.5 Pro performed even better, with <italic>F</italic><sub>1</sub>-scores of 86.5% and 84.3%, respectively. For the ASDBank dataset, direct diagnosis results were lower, with <italic>F</italic><sub>1</sub>-scores of 56% for ChatGPT with GPT-4 and 54% for ChatGPT with GPT-4o. Under code generation, ChatGPT with GPT-o3 reached 67.9%, and Claude 3.5 performed reasonably well with 60%. Gemini 2.5 Pro failed to respond under this assessment condition. In the Distress Analysis Interview Corpus-Wizard-of-Oz dataset, direct diagnosis yielded high accuracy (70.9%) but a poor <italic>F</italic><sub>1</sub>-score of 8% using ChatGPT with GPT-4o. Code generation improved specificity—88.6% with ChatGPT with GPT-4o—but <italic>F</italic><sub>1</sub>-scores remained low overall. These findings suggest that, while clinical scales may help structure outputs, prompting alone remains insufficient for consistent diagnostic accuracy.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Current LLM-based chatbots, when prompted naively, underperform on psychiatric and behavioral diagnostic tasks compared to specialized machine learning models. Clinical assessment scales might modestly aid chatbot performance, but more sophisticated prompt engineering and domain integration are likely required to reach clinically actionable standards.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>neurological diagnostics</kwd>
        <kwd>classification</kwd>
        <kwd>large language model</kwd>
        <kwd>LLM</kwd>
        <kwd>chatbot</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Large language models (LLMs) have recently demonstrated capabilities that closely approximate or exceed human cognitive functions in various domains [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Given their efficacy in executing complex tasks, there is a burgeoning interest in exploring the potential applications of LLMs in clinical settings, including in areas such as providing emotional support [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>] and mental health diagnoses [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. The diagnostic process for neurobehavioral conditions typically encompasses comprehensive clinical assessments and longitudinal behavioral observations [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. The integration of LLMs (as well as other machine learning [ML] models) into this process could potentially streamline this complex and time-consuming diagnostic procedure by facilitating automated screening processes [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. While some research highlights some of the potential challenges of using LLMs for these tasks [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>], they are emerging as a promising avenue for developing scalable and accessible screening services.</p>
        <p>ChatGPT [<xref ref-type="bibr" rid="ref19">19</xref>], Gemini [<xref ref-type="bibr" rid="ref20">20</xref>], and Claude [<xref ref-type="bibr" rid="ref21">21</xref>], prominent LLM-based conversational agents, have been the subject of evaluation for their potential in digital neurobehavioral diagnostics. Previous studies have indicated that the capabilities of chatbots for neurobehavioral classification remain limited even when assessing specific conditions or smaller patient cohorts [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. In response to these findings, subsequent research efforts have focused on enhancing ChatGPT’s performance in this domain [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], typically using varied prompting strategies such as the formulation of precise inquiries and the provision of relevant contextual information.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>Building on these foundational works, we aimed to evaluate the use of LLM-based chatbots to aid in automated diagnostics for neuropsychiatric conditions using assessment scales. We evaluated two paradigms: (1) directly deriving diagnoses from textual data and (2) chatbot-generated code executed in a local environment for diagnostic classification. As an attempt to reach clinical relevance, we instructed the chatbots to either provide ratings on standardized clinical assessment scales which were then used to derive a final diagnosis or to incorporate these ratings into their diagnostic decision making. This approach aimed to leverage established clinical approaches to diagnosis while harnessing the analytical capabilities of LLMs. We hypothesized that offering this clinically grounded method for automated diagnosis would lead to improved diagnostic performance.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>We implemented 4 distinct methodologies to evaluate the diagnostic capabilities of the chatbots (<xref rid="figure1" ref-type="fig">Figure 1</xref>). In the direct diagnosis approach without assessment scales, we directly input data into the conversational artificial intelligence (AI) model, which then generated classification results. This process involved providing the chatbot with the processed dataset as input data and defining its primary task as providing neurobehavioral classification results for the condition of interest. We instructed the chatbot to derive diagnostic classifications for all participants using the text data from the entire processed dataset. If the chatbot indicated an inability to perform this task and requested to run in our environment, we directed it to use its pretrained knowledge to complete the task (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for the detailed prompts of each condition).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The 4 methods we explored in this study. The top 2 panels illustrate the direct diagnosis approach with and without the use of assessment scales, which require the chatbots to directly provide predictions or ratings on assessment scales. The bottom 2 panels, consisting of the code generation approach with and without assessment scales, require the chatbots to generate code that is subsequently executed in a Python environment.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e75030_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The direct diagnosis approach using assessment scales involved inputting both data and a clinical assessment scale into the model; the model was subsequently tasked with rating items on the scale and providing these ratings as output. We then applied predefined thresholds from the clinical assessment scales to each data subject’s ratings to derive final neurobehavioral diagnoses.</p>
        <p>For both direct diagnosis conditions, we performed zero-shot classification without conducting training and testing splits, as we aimed to evaluate the models’ ability to generalize from their pretrained knowledge. This process was repeated 5 times for each condition, with results averaged across iterations to improve robustness.</p>
        <p>In addition to direct diagnosis, we explored a code generation approach to determine whether LLM-based chatbots could perform automated neurobehavioral classification by externalizing their reasoning into executable models. The motivation for the code generation condition stemmed from the observation that LLMs such as ChatGPT often struggle with directly solving complex reasoning tasks (at the time of our experiments) but can excel at generating code that reliably solves these problems. A notable example is that while ChatGPT frequently fails to correctly solve math puzzles through direct reasoning, it can generate Python code that solves them efficiently when executed. Recent studies have examined this phenomenon, revealing that LLMs perform poorly on complex logic-based tasks when relying solely on their internal reasoning capabilities yet demonstrate improved performance when prompted to generate code that encodes the required logic [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. This suggests that the reasoning and structure embedded in generated code may enable LLMs to circumvent some of the limitations they face during direct response generation. While the diagnostic processes under the code generation condition differed fundamentally from those under the direct diagnosis condition, they offer a complementary perspective on the models’ capabilities. Notably, the chatbots frequently produced executable code—even without explicit prompting—in our early pilot tests.</p>
        <p>In the code generation approach without assessment scales, which served as a control condition, we fed the processed data as input into the conversational AI model. Following the data review, we instructed the chatbot to select what it deemed the most appropriate algorithm for the task and output the corresponding Python code. This code was subsequently executed in an external Python environment (Python Software Foundation). We tasked the chatbot with conducting stratified 5-fold cross-validation on the dataset, reporting <italic>F</italic><sub>1</sub>-score, specificity, sensitivity, and accuracy as performance metrics. To optimize results, we engaged in an iterative process with the chatbot, requesting performance improvements until the generated code produced results consistent with its previous 2 iterations.</p>
        <p>The code generation approach using assessment scales began with providing the chatbot with the processed data as input followed by a standardized assessment scale. We then prompted the model to generate the code and apply established cutoff thresholds from these scales as output to determine the final diagnosis. However, observing that this often resulted in unsatisfactory classifications, we next encouraged the chatbots to incorporate these ratings into an ML algorithm of their own design. The chatbots produced the algorithm, which we then ran in our local environment. If no further performance improvements from the condition without assessment scales were observed, we directed the chatbot to revert to the algorithm used in the condition without the assessment scale and integrate the assessment scale ratings into the training procedures. This methodology facilitated a direct comparison of performance between 2 code generation approaches, making any potential improvements attributable to the assessment scale approach (see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for an example of generated code).</p>
        <p><xref ref-type="table" rid="table1">Table 1</xref> shows the final algorithm used in each code generation condition. We ensured that the chatbots incorporated the assessment scale ratings into their algorithms, requesting integration if they were initially omitted. The iterative process for each condition continued until the performance of the generated code reached a plateau with no significant improvement observed over 2 consecutive iterations.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Machine learning algorithms produced by the chatbots in the 2 code generation conditions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="160"/>
            <col width="0"/>
            <col width="250"/>
            <col width="0"/>
            <col width="260"/>
            <col width="0"/>
            <col width="300"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Approach</td>
                <td colspan="5">Algorithms</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">AphasiaBank</td>
                <td colspan="2">ASDBank</td>
                <td>DAIC-WOZ<sup>a</sup> database</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="8"><bold>Code generation</bold>―<bold>no assessment scale</bold></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">TF-IDF<sup>b</sup>+LR<sup>c</sup></td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">CountVectorizer+LR</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">Word Count</td>
                <td colspan="2">CountVectorizer+LR</td>
                <td colspan="2">TF-IDF+XGB<sup>d</sup> classifier</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td colspan="2">Word Count</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">TF-IDF+LR</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">TF-IDF+LinearSVC</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">TF-IDF+LinearSVC</td>
                <td colspan="2">Sentence embedding+LinearSVC</td>
              </tr>
              <tr valign="top">
                <td colspan="8"><bold>Code generation</bold>―<bold>assessment scale</bold></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">CountVectorizer+LR</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">Word Count+LR</td>
                <td colspan="2">CountVectorizer+LR</td>
                <td colspan="2">TF-IDF+XGB classifier</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td colspan="2">Word Count+threshold</td>
                <td colspan="2">TF-IDF+RF<sup>e</sup></td>
                <td colspan="2">TF-IDF+LR</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">TF-IDF+LinearSVC</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">TF-IDF+LR</td>
                <td colspan="2">—<sup>f</sup></td>
                <td colspan="2">Sentence embedding+LinearSVC</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>DAIC-WOZ: Distress Analysis Interview Corpus-Wizard-of-Oz.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>TF-IDF: term frequency–inverse document frequency.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>LR: logistic regression.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>XGB: extreme gradient boosting.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>RF: random forest.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>To assess statistical significance, we conducted 1000-fold permutation tests on the <italic>F</italic><sub>1</sub>-score and accuracy to compare (1) direct diagnosis and code generation in non–assessment scale setups and (2) non–assessment scale and assessment scale setups. For comparisons between direct diagnosis and code generation, we aligned predictions by matching test sets across folds, comparing the performance on the same data points in both conditions and averaging the results across folds. For the checklist comparison, we ran permutation tests over all predictions from the respective conditions, also averaging across folds. Comparisons were omitted in cases in which the prediction patterns were apparently random or when performance in the latter condition was lower than in the former (ie, assessment scale&#60;non–assessment scale or code generation&#60;direct diagnosis).</p>
      </sec>
      <sec>
        <title>Datasets</title>
        <p>We used 3 distinct databases, each focusing on a specific neurobehavioral condition. Two of these, ASDBank [<xref ref-type="bibr" rid="ref26">26</xref>] and AphasiaBank [<xref ref-type="bibr" rid="ref27">27</xref>], are sourced from TalkBank [<xref ref-type="bibr" rid="ref28">28</xref>] and contain language samples for autism spectrum disorder (ASD) and aphasia, respectively, whereas the third database, the Distress Analysis Interview Corpus-Wizard-of-Oz (DAIC-WOZ) database [<xref ref-type="bibr" rid="ref29">29</xref>], contains textual data from patients with depression, anxiety, and posttraumatic stress disorder.</p>
        <p>AphasiaBank [<xref ref-type="bibr" rid="ref27">27</xref>] is a repository containing multimedia language samples from both participants with aphasia and control participants. These samples were collected through standardized discourse tasks, including unstructured speech samples, picture descriptions, story narratives, and procedural discourse.</p>
        <p>ASDBank [<xref ref-type="bibr" rid="ref26">26</xref>] comprises a collection of language samples and interactions from individuals diagnosed with ASD. The data within ASDBank include transcribed audio and video recordings of clinical interviews and naturalistic interactions.</p>
        <p>We used all available English-language transcripts from both AphasiaBank and ASDBank. Data processing was performed to consolidate all samples from a single participant into 1 data point. The resulting dataset comprised 715 aphasia data points and 352 control data points for AphasiaBank and 34 ASD data points and 44 control data points for ASDBank.</p>
        <p>The DAIC-WOZ database [<xref ref-type="bibr" rid="ref29">29</xref>] consists of semistructured interviews conducted by a simulated agent designed to identify symptoms of depression and posttraumatic stress disorder. These interviews include questions about personal experiences, quality of life, and emotions. We consolidated all samples from a participant, including the interviewer’s input, into a single data point. The DAIC-WOZ database includes 56 patient data points and 133 control data points.</p>
        <p><xref ref-type="table" rid="table2">Table 2</xref> provides a summary of the diagnosis distribution across each dataset.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The number of control and patient data points in each of the datasets we evaluated.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="370"/>
            <col width="280"/>
            <col width="350"/>
            <thead>
              <tr valign="top">
                <td>Database</td>
                <td>Number of control data points</td>
                <td>Number of data points for condition of interest</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>AphasiaBank</td>
                <td>352</td>
                <td>715</td>
              </tr>
              <tr valign="top">
                <td>ASDBank</td>
                <td>44</td>
                <td>34</td>
              </tr>
              <tr valign="top">
                <td>DAIC-WOZ<sup>a</sup></td>
                <td>133</td>
                <td>56</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>DAIC-WOZ: Distress Analysis Interview Corpus-Wizard-of-Oz.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Aphasia, depression, and ASD each manifest distinct linguistic characteristics that are both overlapping and unique. Aphasia, typically resulting from brain damage, is characterized by impaired language production and comprehension, often including repetitive language and the frequent use of filler words as individuals struggle to retrieve or organize words effectively [<xref ref-type="bibr" rid="ref30">30</xref>]. Depression, while primarily a mood disorder, affects language through reduced verbal output, monotone speech, and a preference for negative or self-critical language patterns. Depressive language, such as expressions of negativity, can be a key symptom of the condition. Another characteristic linguistic feature is an excessive number of sighs, reflecting physical or emotional fatigue. ASD is marked by unique communication challenges, including delayed speech development, echolalia (repetition of phrases), difficulty with pragmatic language (eg, understanding sarcasm or social cues), and overly literal or formal speech. Individuals with ASD may also exhibit fragmented sentences and frequent use of filler words, reflecting challenges in organizing thoughts or navigating social interactions [<xref ref-type="bibr" rid="ref31">31</xref>].</p>
        <p>Many previous studies have leveraged the datasets we used in our research. However, much of the existing work has focused on advanced tasks such as multimodal detection or severity classification rather than simpler text-based binary classification using chatbots. These studies have often achieved strong (although not clinically translatable) performances, frequently exceeding 80% in <italic>F</italic><sub>1</sub>-scores or accuracy. For example, Dinkel et al [<xref ref-type="bibr" rid="ref32">32</xref>] applied a text-based multitask network to the DAIC-WOZ dataset, achieving an <italic>F</italic><sub>1</sub>-score of 0.84 for binary detection. Similarly, Agrawal and Mishra [<xref ref-type="bibr" rid="ref33">33</xref>] used a fused bidirectional encoder representation from transformers–a bidirectional long short-term memory model integrated with Extreme Gradient Boosting to perform binary classification, achieving an <italic>F</italic><sub>1</sub>-score of 91%.</p>
        <p>For the AphasiaBank dataset, most previous studies have focused on severity classification, making direct comparisons with our binary classification study challenging. The only relevant work, conducted by Cong et al [<xref ref-type="bibr" rid="ref34">34</xref>], found that using LLM-derived surprisal features facilitated detection, achieving 79% in both accuracy and <italic>F</italic><sub>1</sub>-score. Similarly, studies involving the ASDBank dataset are limited, partly due to its recent development. Chu et al [<xref ref-type="bibr" rid="ref35">35</xref>] included another dataset, the Child Language Data Exchange System, as a source of healthy control data. By extracting a few linguistic features from these 2 datasets, their binary classification approaches reached <italic>F</italic><sub>1</sub>-scores of over 80% [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>These studies suggest that LLM-based models directly diagnosing from the datasets used in this study should achieve high performance if chatbots exhibit comparable classification capabilities to those models in the previous studies.</p>
      </sec>
      <sec>
        <title>Models</title>
        <p>We evaluated 2 approaches using 3 types of state-of-the-art conversational AI models: ChatGPT with GPT-4, ChatGPT with GPT-4o, and ChatGPT with GPT-o3 (OpenAI); Gemini 2.5 Pro (Google AI); and Claude 3.5 Sonnet (Anthropic). These models were selected because they are some of the most widely used modern LLMs and because their efficacy in neurobehavioral classification tasks remains underexamined in the current literature. Notably, models such as Gemini 2.5 Pro and ChatGPT with GPT-o3 incorporate built-in prompting strategies such as chain-of-thought reasoning, allowing us to examine how such strategies influence performance. We excluded open models such as Llama because they do not support file input and including them would require a different approach from that used for the other models we tested.</p>
      </sec>
      <sec>
        <title>Assessment Scales</title>
        <p>We incorporated 3 widely recognized assessment scales and checklists used in clinical settings. We selected scales that assess behaviors at least tangentially related to language and that do not require extended observation periods. For example, the Autism Spectrum Quotient evaluates traits such as social preferences (“S/he prefers to do things with others rather than on her/his own”), behavioral patterns (“S/he prefers to do things the same way over and over again”), and attention capabilities (“have difficulty sustaining attention in tasks or fun activities”). The rating system for this checklist—<italic>definitely disagree</italic>, <italic>slightly disagree</italic>, <italic>slightly agree</italic>, and <italic>definitely agree</italic>—does not necessitate longitudinal observation, unlike scales that use time-sensitive ratings such as <italic>rarely</italic>, <italic>less often</italic>, <italic>very often</italic>, and <italic>always</italic>.</p>
        <p>The assessment scales and checklists included in our study were as follows: (1) the fluency test in the Western Aphasia Battery–Aphasia Quotient (AphasiaBank) [<xref ref-type="bibr" rid="ref36">36</xref>], (2) the Autism Spectrum Quotient (ASDBank) [<xref ref-type="bibr" rid="ref37">37</xref>], and (3) Burn’s Depression Checklist [<xref ref-type="bibr" rid="ref38">38</xref>] (DAIC-WOZ database).</p>
        <p>In the 2 direct diagnosis conditions, we conducted the experimental procedure 5 times and obtained results based on the entirety of each dataset. We did not perform a training and testing split for these conditions, opting instead for a zero-shot classification approach to assess the models’ ability to generalize from their pretrained knowledge. However, in the code generation conditions, we instructed the chatbot to perform stratified 5-fold cross-validation on the entire dataset. The training and testing split ratio during each fold was 4:1. Results were evaluated based on the test sets generated during each fold and subsequently averaged.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study did not involve the recruitment of human participants or the collection of new data. All analyses were conducted on publicly available, deidentified datasets—AphasiaBank, the DAIC-WOZ database, and ASDBank—that are widely used in research and do not contain personally identifiable information. As such, no application for ethics review was submitted. This approach is consistent with institutional and regional guidelines that exempt studies using publicly available, deidentified data from human subjects review.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Core Results</title>
        <p><xref ref-type="table" rid="table3">Tables 3</xref> to <xref ref-type="table" rid="table8">8</xref> present the cross-validation results of the 2 approaches applied to each dataset, reporting accuracy, <italic>F</italic><sub>1</sub>-score, specificity, and sensitivity. Performance under the direct diagnosis conditions varied across datasets.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Results of 4 approaches on the AphasiaBank dataset in the direct diagnosis condition.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="280"/>
            <col width="0"/>
            <col width="170"/>
            <col width="180"/>
            <col width="0"/>
            <col width="170"/>
            <col width="0"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td>Accuracy</td>
                <td colspan="2"><italic>F</italic><sub>1</sub>-score</td>
                <td colspan="2">Specificity</td>
                <td>Sensitivity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">Results from Cong et al [<xref ref-type="bibr" rid="ref34">34</xref>]</td>
                <td>0.79</td>
                <td colspan="2">0.79</td>
                <td colspan="2">―<sup>a</sup></td>
                <td>0.79</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>No assessment scale, mean (SD)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.567 (0.1)</td>
                <td>0.6556 (0.136)</td>
                <td colspan="2">0.33 (0.3)</td>
                <td colspan="2">0.684 (0.29)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.561 (0.029)</td>
                <td>0.648 (0.111)</td>
                <td colspan="2">0.397 (0.11)</td>
                <td colspan="2">0.642 (0.22)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.49 (0.06)</td>
                <td>0.544 (0.113)</td>
                <td colspan="2">0.328 (0.01)</td>
                <td colspan="2">0.665 (0.01)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">0.508 (0.01)</td>
                <td>0.599 (0.012)</td>
                <td colspan="2">0.317 (0.02)</td>
                <td colspan="2">0.659 (0.013)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Assessment scale, mean (SD)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.293 (0.34)<sup>b</sup></td>
                <td>0.358 (0.376)<sup>b</sup></td>
                <td colspan="2">0.297 (0.187)</td>
                <td colspan="2">0.647 (0.09)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.497 (0.01)<sup>b</sup></td>
                <td>0.55 (0.02)<sup>b</sup></td>
                <td colspan="2">0.577 (0.02)</td>
                <td colspan="2">0.458 (0.02)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.555 (0.183)<sup>c</sup></td>
                <td>0.568 (0.4)<sup>c</sup></td>
                <td colspan="2">0.108 (0.19)</td>
                <td colspan="2">0.645 (0.037)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">0.661 (0.07)<sup>b</sup></td>
                <td>0.792 (0.003)<sup>b</sup></td>
                <td colspan="2">0.381 (0.08)</td>
                <td colspan="2">0.672 (0.003)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Missing data.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>No test conducted.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup><italic>P</italic>&#60;.001 for GPT-o3 accuracy; <italic>P</italic>&#60;.001 for <italic>F</italic><sub>1</sub>-score (no assessment scale vs assessment scale).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Results of 4 approaches on the AphasiaBank dataset in the code generation condition.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="160"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="210"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Accuracy, mean (SD)</td>
                <td><italic>F</italic><sub>1</sub>-score, mean (SD)</td>
                <td>Specificity, mean (SD)</td>
                <td>Sensitivity, mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>No assessment scale</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>0.67 (0.16)<sup>a</sup></td>
                <td>0.74 (0.17)<sup>a</sup></td>
                <td>0.79 (0.24)</td>
                <td>0.40 (0.31)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td>0.67 (0.0113)<sup>a</sup></td>
                <td>0.802 (0.008)<sup>a</sup></td>
                <td>0.68 (0.011)</td>
                <td>1 (0)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td>0.835 (0.035)<sup>a</sup></td>
                <td>0.865 (0.029)<sup>a</sup></td>
                <td>0.920 (0.077)</td>
                <td>0.793 (0.041)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td>0.605 (0.034)<sup>b</sup></td>
                <td>0.623 (0.036)<sup>b</sup></td>
                <td>0.844 (0.037)</td>
                <td>0.488 (0.033)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td>0.7882 (0.02)<sup>a</sup></td>
                <td>0.8429 (0.016)<sup>a</sup></td>
                <td>0.6645 (0.057)</td>
                <td>0.8490 (0.031)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Assessment scale</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>0.67 (0.16)<sup>b</sup></td>
                <td>0.74 (0.17)<sup>b</sup></td>
                <td>0.80 (0.25)</td>
                <td>0.41 (0.30)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td>0.741 (0.022)<sup>c</sup></td>
                <td>0.814 (0.016)<sup>c</sup></td>
                <td>0.786 (0.024)</td>
                <td>0.843 (0.007)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td>0.835 (0.035)<sup>b</sup></td>
                <td>0.865 (0.029)<sup>b</sup></td>
                <td>0.920 (0.077)</td>
                <td>0.793 (0.041)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td>0.608 (0.036)<sup>b</sup></td>
                <td>0.627 (0.039)<sup>b</sup></td>
                <td>0.844 (0.037)</td>
                <td>0.492 (0.036)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td>0.7891 (0.021)<sup>b</sup></td>
                <td>0.8437 (0.015)<sup>b</sup></td>
                <td>0.6674 (0.072)</td>
                <td>0.8490 (0.024)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup><italic>P</italic>&#60;.001 for GPT-4 accuracy; <italic>P</italic>&#60;.001 for GPT-4 <italic>F</italic><sub>1</sub>-score; <italic>P</italic>&#60;.001 for GPT-4o accuracy; <italic>P</italic>&#60;.001 for GPT-4o <italic>F</italic><sub>1</sub>-score; <italic>P</italic>&#60;.001 for GPT-o3 accuracy; <italic>P</italic>&#60;.001 for GPT-o3 <italic>F</italic><sub>1</sub>-score; <italic>P</italic>&#60;.001 for Gemini 2.5 Pro accuracy; <italic>P</italic>&#60;.001 for Gemini 2.5 Pro <italic>F</italic><sub>1</sub>-score (direct diagnosis versus code generation in non–assessment scale setups when marked in the “No assessment scale” section).</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>No test conducted.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup><italic>P</italic>=.07 for GPT-4o accuracy; <italic>P</italic>=.06 for GPT-4o <italic>F</italic><sub>1</sub>-score (assessment vs no assessment).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Results of 4 approaches on the ASDBank dataset in the direct diagnosis condition.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="280"/>
            <col width="0"/>
            <col width="170"/>
            <col width="180"/>
            <col width="0"/>
            <col width="170"/>
            <col width="0"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td>Accuracy</td>
                <td colspan="2"><italic>F</italic><sub>1</sub>-score</td>
                <td colspan="2">Specificity</td>
                <td>Sensitivity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">Results from Chu et al [<xref ref-type="bibr" rid="ref35">35</xref>]</td>
                <td>0.76</td>
                <td colspan="2">0.85</td>
                <td colspan="2">0.2</td>
                <td>0.94</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>No assessment scale, mean (SD)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.5 (0.00)</td>
                <td>0.598 (0.00)</td>
                <td colspan="2">0.227 (0.00)</td>
                <td colspan="2">0.853 (0.00)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.421 (0.03)</td>
                <td>0.514 (0.129)</td>
                <td colspan="2">0.155 (0.212)</td>
                <td colspan="2">0.765 (0.323)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.6026 (0.00)</td>
                <td>0.575 (0.00)</td>
                <td colspan="2">0.667 (0.00)</td>
                <td colspan="2">0.575 (0.00)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">0.485 (0.08)</td>
                <td>0.449 (0.09)</td>
                <td colspan="2">0.549 (0.08)</td>
                <td colspan="2">0.421 (0.08)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Assessment scale, mean (SD)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.427 (0.01)<sup>a</sup></td>
                <td>0.56 (0.08)<sup>a</sup></td>
                <td colspan="2">0.09 (0.157)</td>
                <td colspan="2">0.863 (0.24)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.491 (0.09)<sup>a</sup></td>
                <td>0.542 (0.117)<sup>a</sup></td>
                <td colspan="2">0.236 (0.39)</td>
                <td colspan="2">0.802 (0.342)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.436 (0.00)<sup>a</sup></td>
                <td>0.607 (0.00)<sup>a</sup></td>
                <td colspan="2">0.00 (0.00)</td>
                <td colspan="2">0.436 (0.00)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>No test conducted.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Results of 4 approaches on the ASDBank dataset in the code generation condition.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="160"/>
            <col width="0"/>
            <col width="200"/>
            <col width="210"/>
            <col width="0"/>
            <col width="190"/>
            <col width="0"/>
            <col width="210"/>
            <thead>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td>Accuracy, mean (SD)</td>
                <td colspan="2"><italic>F</italic><sub>1</sub>-score, mean (SD)</td>
                <td colspan="2">Specificity, mean (SD)</td>
                <td>Sensitivity, mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="9">
                  <bold>No assessment scale</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.618 (0.125)<sup>a</sup></td>
                <td>0.616 (0.104)<sup>a</sup></td>
                <td colspan="2">0.55 (0.286)</td>
                <td colspan="2">0.71 (0.199)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.653 (0.103)<sup>a</sup></td>
                <td>0.55 (0.184)<sup>a</sup></td>
                <td colspan="2">0.73 (0.303)</td>
                <td colspan="2">0.576 (0.378)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.679 (0.041)<sup>a</sup></td>
                <td>0.679 (0.041)<sup>a</sup></td>
                <td colspan="2">0.864 (0.083)</td>
                <td colspan="2">0.433 (0.195)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td colspan="2">0.68 (0.16)<sup>b</sup></td>
                <td>0.6 (0.22)<sup>b</sup></td>
                <td colspan="2">0.67 (0.35)</td>
                <td colspan="2">0.69 (0.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">0.74 (0.09)<sup>a</sup></td>
                <td>0.63 (0.14)<sup>a</sup></td>
                <td colspan="2">0.52 (0.16)</td>
                <td colspan="2">0.91 (0.08)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Assessment scale</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.642 (0.165)<sup>c</sup></td>
                <td>0.628 (0.17)<sup>c</sup></td>
                <td colspan="2">0.6 (0.334)</td>
                <td colspan="2">0.695 (0.231)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.628 (0.194)<sup>b</sup></td>
                <td>0.592 (0.1974)<sup>b</sup></td>
                <td colspan="2">0.689 (0.325)</td>
                <td colspan="2">0.578 (0.257)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.679 (0.041)<sup>b</sup></td>
                <td>0.679 (0.041)<sup>b</sup></td>
                <td colspan="2">0.864 (0.083)</td>
                <td colspan="2">0.433 (0.195)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td colspan="2">0.64 (0.13)<sup>b</sup></td>
                <td>0.6 (0.23)<sup>b</sup></td>
                <td colspan="2">0.69 (0.41)</td>
                <td colspan="2">0.67 (0.36)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup><italic>P=</italic>.002 for GPT-4 accuracy; <italic>P=</italic>.001 for GPT-4 <italic>F</italic><sub>1</sub>-score; <italic>P=</italic>.03 for GPT-4o accuracy; <italic>P=</italic>.015 for GPT-4o <italic>F</italic><sub>1</sub>-score; <italic>P=</italic>.009 for GPT-o3 accuracy; <italic>P=</italic>.005 for GPT-o3 <italic>F</italic><sub>1</sub>-score; <italic>P=</italic>.006 for Gemini 2.5 Pro accuracy; <italic>P=</italic>.003 for Gemini 2.5 Pro <italic>F</italic><sub>1</sub>-score (direct diagnosis versus code generation in non–assessment scale setups when marked in the “No assessment scale” section).</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>No test conducted.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup><italic>P=</italic>.99 and <italic>P=</italic>.99 for GPT-4 accuracy and <italic>F</italic><sub>1</sub>-score (assessment vs no assessment).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table7">
          <label>Table 7</label>
          <caption>
            <p>Results of 4 approaches on the Distress Analysis Interview Corpus-Wizard-of-Oz (DAIC-WOZ) database in the direct diagnosis condition.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="350"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="160"/>
            <col width="0"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">Accuracy</td>
                <td colspan="2"><italic>F</italic><sub>1</sub>-score</td>
                <td colspan="2">Specificity</td>
                <td>Sensitivity</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3">Results from Dinkel et al [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
                <td colspan="2">0.86</td>
                <td colspan="2">0.84</td>
                <td colspan="2">―</td>
                <td>0.83</td>
              </tr>
              <tr valign="top">
                <td colspan="3">Results from Agrawal and Mishra [<xref ref-type="bibr" rid="ref33">33</xref>]</td>
                <td colspan="2">―</td>
                <td colspan="2">0.91</td>
                <td colspan="2">―</td>
                <td>0.89</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>No assessment scale, mean (SD)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.333 (0.04)</td>
                <td colspan="2">0.452 (0.04)</td>
                <td colspan="2">0.08 (0.51)</td>
                <td colspan="2">0.939 (0.039)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.623 (0.01)</td>
                <td colspan="2">0.346 (0.176)</td>
                <td colspan="2">0.711 (0.168)</td>
                <td colspan="2">0.409 (0.347)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.595 (0.05)</td>
                <td colspan="2">0.252 (0.12)</td>
                <td colspan="2">0.704 (0.02)</td>
                <td colspan="2">0.269 (0.05)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">0.616 (0.11)</td>
                <td colspan="2">0.222 (0.132)</td>
                <td colspan="2">0.700 (0.02)</td>
                <td colspan="2">0.294 (0.09)</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Assessment scale, mean (SD)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td colspan="2">0.56 (0.06)<sup>a</sup></td>
                <td colspan="2">0.416 (0.07)<sup>a</sup></td>
                <td colspan="2">0.56 (0.08)</td>
                <td colspan="2">0.516 (0.05)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td colspan="2">0.709 (0.05)<sup>a</sup></td>
                <td colspan="2">0.08 (0.01)<sup>a</sup></td>
                <td colspan="2">1 (0.00)</td>
                <td colspan="2">0.429 (0.006)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td colspan="2">0.635 (0.05)<sup>b</sup></td>
                <td colspan="2">0.281 (0.14)<sup>b</sup></td>
                <td colspan="2">0.72 (0.01)</td>
                <td colspan="2">0.355 (0.08)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td colspan="2">0.54 (0.07)<sup>a</sup></td>
                <td colspan="2">0.363 (0.09)<sup>a</sup></td>
                <td colspan="2">0.71 (0.06)</td>
                <td colspan="2">0.306 (0.08)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table7fn1">
              <p><sup>a</sup>No test conducted.</p>
            </fn>
            <fn id="table7fn2">
              <p><sup>b</sup><italic>P=</italic>.44 for GPT-o3 accuracy and <italic>P=</italic>.43 for <italic>F</italic><sub>1</sub>-score (assessment vs no assessment).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table8">
          <label>Table 8</label>
          <caption>
            <p>Results of 4 approaches on the Distress Analysis Interview Corpus-Wizard-of-Oz (DAIC-WOZ) database in the code generation condition.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="180"/>
            <col width="200"/>
            <col width="190"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Accuracy, mean (SD)</td>
                <td><italic>F</italic><sub>1</sub>-score, mean (SD)</td>
                <td>Specificity, mean (SD)</td>
                <td>Sensitivity, mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>No assessment scale</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>0.624 (0.024)<sup>a</sup></td>
                <td>0.268 (0.047)<sup>a</sup></td>
                <td>0.79 (0.035)</td>
                <td>0.233 (0.048)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td>0.681 (0.126)<sup>a</sup></td>
                <td>0.2038 (0.1474)<sup>a</sup></td>
                <td>0.886 (0.087)</td>
                <td>0.2286 (0.2382)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td>0.6667 (0.0572)<sup>a</sup></td>
                <td>0.1472 (0.1636)<sup>a</sup></td>
                <td>0.1091 (0.1185)</td>
                <td>0.1091 (0.1185)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td>0.649 (0.103)<sup>b</sup></td>
                <td>0.2386 (0.113)<sup>b</sup></td>
                <td>0.7672 (0.0251)</td>
                <td>0.2136 (0.1131)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td>0.6138 (0.08)<sup>b</sup></td>
                <td>0.4037 (0.09)<sup>b</sup></td>
                <td>0.6846 (0.11)</td>
                <td>0.4439 (0.11)</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Assessment scale</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>0.63 (0.027)<sup>c</sup></td>
                <td>0.271 (0.05)<sup>c</sup></td>
                <td>0.797 (0.036)</td>
                <td>0.233 (0.048)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4o</td>
                <td>0.681 (0.1587)<sup>b</sup></td>
                <td>0.213 (0.1587)<sup>b</sup></td>
                <td>0.9 (0.073)</td>
                <td>0.223 (0.2389)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-o3</td>
                <td>0.619 (0.06)<sup>b</sup></td>
                <td>0.283 (0.161)<sup>b</sup></td>
                <td>0.768 (0.09)</td>
                <td>0.2682 (0.17)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Claude 3.5</td>
                <td>0.657 (0.109)<sup>c</sup></td>
                <td>0.33 (0.1153)<sup>c</sup></td>
                <td>0.7738 (0.1)</td>
                <td>0.328 (0.1153)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Gemini 2.5 Pro</td>
                <td>0.518 (0.068)<sup>b</sup></td>
                <td>0.4822 (0.037)<sup>b</sup></td>
                <td>0.5524 (0.13)</td>
                <td>0.478 (0.03)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table8fn1">
              <p><sup>a</sup><italic>P&#60;</italic>.001 for GPT-4 accuracy; <italic>P&#60;</italic>.001 for GPT-4 <italic>F</italic><sub>1</sub>-score; <italic>P&#60;</italic>.001 for GPT-4o accuracy; <italic>P&#60;</italic>.001 for GPT-4o <italic>F</italic><sub>1</sub>-score; <italic>P&#60;</italic>.001 for GPT-o3 accuracy; <italic>P&#60;</italic>.001 for GPT-o3 <italic>F</italic><sub>1</sub>-score (direct diagnosis versus code generation in non–assessment scale setups when marked in the “No assessment scale” section).</p>
            </fn>
            <fn id="table8fn2">
              <p><sup>b</sup>No test conducted.</p>
            </fn>
            <fn id="table8fn3">
              <p><sup>c</sup><italic>P=</italic>.80, <italic>P=</italic>.60 for GPT-4 accuracy and <italic>F</italic><sub>1</sub>-score; <italic>P=</italic>.30, <italic>P</italic>=.20 for Claude 3.5 accuracy and <italic>F</italic><sub>1</sub>-score (assessment vs no assessment).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p><xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref> [<xref ref-type="bibr" rid="ref34">34</xref>] compare approaches on the AphasiaBank dataset against a baseline performance of 79% across metrics in the study by Cong et al [<xref ref-type="bibr" rid="ref34">34</xref>]. All of our direct diagnosis conditions yielded a lower performance than this baseline. Our code generation conditions improved results significantly, with ChatGPT with GPT-o3 achieving the highest <italic>F</italic><sub>1</sub>-score (0.865) and balanced specificity (0.92) and sensitivity (0.793), surpassing the baseline by Cong et al [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
        <p>The results on the ASDBank dataset were compared against the baseline results from Chu et al [<xref ref-type="bibr" rid="ref35">35</xref>], who achieved an <italic>F</italic><sub>1</sub>-score of 0.85 and a high sensitivity of 0.94, although specificity was notably low at 0.2. Our direct diagnosis approaches struggled in comparison, with ChatGPT with GPT-4 and ChatGPT with GPT-o3 producing lower <italic>F</italic><sub>1</sub>-scores (0.598 and 0.575, respectively) and poor specificity. The code generation condition significantly improved overall performance, with Claude 3.5 achieving the highest accuracy (0.68) and <italic>F</italic><sub>1</sub>-score (0.6). The other models also showed improvement, but their performance on specificity and sensitivity was less consistent. Gemini 2.5 Pro was unable to provide ratings on the checklist due to content restrictions related to ethical guidelines.</p>
        <p>For the DAIC-WOZ dataset, the studies by Dinkel et al [<xref ref-type="bibr" rid="ref32">32</xref>] and Agrawal and Mishra [<xref ref-type="bibr" rid="ref33">33</xref>] established strong baselines, achieving <italic>F</italic><sub>1</sub>-scores of 0.84 and 0.91, respectively, along with high accuracy and sensitivity. In comparison, our direct diagnosis approaches showed inconsistent performance, with ChatGPT with GPT-4o and ChatGPT with GPT-4 achieving the highest accuracy (0.623) and <italic>F</italic><sub>1</sub>-score (0.452)—notably low values—with even poorer results on the other metrics. While the code generation approaches yielded higher accuracy in some cases, they did not meaningfully improve overall performance as their <italic>F</italic><sub>1</sub>-scores were significantly lower than those of the direct diagnosis condition.</p>
        <p>We also note that most comparisons between assessment scale and no assessment scale conditions did not yield statistically significant differences except for ChatGPT with GPT-o3 and Gemini 2.5 Pro in the AphasiaBank direct diagnosis condition, which showed significant improvements in both accuracy and <italic>F</italic><sub>1</sub>-score.</p>
        <p>Overall, our findings reveal a substantial gap when using the 2 different approaches: code generation and direct diagnosis. While code generation and newer models seem to have improved performance compared to direct prompting, they still did not reach the levels reported in previous studies in most cases. Both approaches fell short of established benchmarks, underscoring the limitations of current LLM-based diagnostic methods that rely solely on prompting without model fine-tuning.</p>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <sec>
          <title>Overview</title>
          <p>We first address the errors in the direct diagnosis approach, which did not appear to work well. We observed that most rounds of classification yielded close-to-random performances, especially for older models (ChatGPT with GPT-4 and ChatGPT with GPT-4o). Interestingly, we noticed patterns in the classification ratings produced, such as digits limited to only multiples of 3 or repeating sequences (eg, 3, 2, 1, 0, 3, 2, 1, 0). We present the percentage of rounds over 5 rounds of classification that followed such patterns in <xref ref-type="table" rid="table9">Table 9</xref>. This demonstrates that a direct diagnosis prompting strategy does not work well if models are presented with the entire dataset at once.</p>
          <table-wrap position="float" id="table9">
            <label>Table 9</label>
            <caption>
              <p>Percentage of random predictions.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="210"/>
              <col width="0"/>
              <col width="220"/>
              <col width="0"/>
              <col width="180"/>
              <col width="0"/>
              <col width="180"/>
              <col width="0"/>
              <col width="180"/>
              <thead>
                <tr valign="top">
                  <td colspan="3">Database and approach</td>
                  <td colspan="2">GPT-4 random predictions n=5 (%)</td>
                  <td colspan="2">GPT-4o random predictions n=5 (%)</td>
                  <td colspan="2">GPT-o3 random predictions n=5 (%)</td>
                  <td>Gemini 2.5 Pro random predictions (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="10">
                    <bold>AphasiaBank</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">80</td>
                  <td colspan="2">60</td>
                  <td colspan="2">0</td>
                  <td colspan="2">0</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">20</td>
                  <td colspan="2">60</td>
                  <td colspan="2">100</td>
                  <td colspan="2">80</td>
                </tr>
                <tr valign="top">
                  <td colspan="10">
                    <bold>ASDBank</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">20</td>
                  <td colspan="2">100</td>
                  <td colspan="2">0</td>
                  <td colspan="2">80</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">100</td>
                  <td colspan="2">100</td>
                  <td colspan="2">0</td>
                  <td colspan="2">―<sup>a</sup></td>
                </tr>
                <tr valign="top">
                  <td colspan="10">
                    <bold>DAIC-WOZ<sup>b</sup> database</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">40</td>
                  <td colspan="2">80</td>
                  <td colspan="2">0</td>
                  <td colspan="2">20</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">20</td>
                  <td colspan="2">100</td>
                  <td colspan="2">20</td>
                  <td colspan="2">0</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table9fn1">
                <p><sup>a</sup>Not applicable.</p>
              </fn>
              <fn id="table9fn2">
                <p><sup>b</sup>DAIC-WOZ: Distress Analysis Interview Corpus-Wizard-of-Oz.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>For the code generation approach, we found some examples of text archetypes (ie, typical examples) that were frequently misclassified. These archetypes often reflect characteristics of the conditions. Common errors we observed are described in the following sections.</p>
        </sec>
        <sec>
          <title>Repetitive Language and Filler Words (Aphasia)</title>
          <p>The presence of repetitive language patterns and an increased frequency of filler words led to misclassification as a high proportion of false positives for aphasia. Control participants’ responses typically exhibited minimal repetition and filler word use. However, even a slight elevation in these linguistic elements frequently resulted in misclassification, with the chatbots erroneously classifying control participants as positives. Notably, misclassified false positives from almost all the chatbots contained these features.</p>
        </sec>
        <sec>
          <title>Fragmented Sentences and Filler Words (ASD)</title>
          <p>Transcripts containing filler words or fragmented sentences were misclassified in almost 100% of cases as false positives originating from individuals with ASD. With generative pretrained transformer models, this archetype was observed in most false-positive data points, indicating a consistent misclassification pattern. In contrast, Claude 3.5 exhibited a different trend because most misclassified points were false negatives. Claude 3.5 did not appear to excessively use the linguistic feature characteristic of this archetype.</p>
        </sec>
        <sec>
          <title>Lack of Depressive Language (Depression)</title>
          <p>Text lacking overt depressive indicators and conveying generally positive sentiments accounted for a large number of false negatives. For instance, statements such as “uh I’d say maybe the fact that it’s a lot different than it was about ten years ago” and “I am pretty happy with the level of education I’ve gotten” often led to false negatives.</p>
        </sec>
        <sec>
          <title>Excessive Amount of Laughter (Depression)</title>
          <p>Texts containing instances of laughter were classified as false negatives in &#62;70% of cases originating from control participants rather than individuals with depression.</p>
        </sec>
        <sec>
          <title>Excessive Number of Sighs (Depression)</title>
          <p>Texts containing references to sighing were categorized as false positives originating from individuals with depression. Over 30% of false-positive cases included this feature, indicating its disproportionate influence on the classification process.</p>
        </sec>
        <sec>
          <title>Frequency of Occurrence of Archetypes</title>
          <p><xref ref-type="table" rid="table10">Table 10</xref> details the frequency of occurrence of these archetypes. The observed misclassifications highlight the inherent constraints of relying on text-based methods for neurobehavioral diagnosis.</p>
          <table-wrap position="float" id="table10">
            <label>Table 10</label>
            <caption>
              <p>Percentage of each text archetype in false-positive or false-negative data points in the code generation conditions averaged across folds.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="350"/>
              <col width="0"/>
              <col width="150"/>
              <col width="0"/>
              <col width="150"/>
              <col width="0"/>
              <col width="100"/>
              <col width="0"/>
              <col width="100"/>
              <col width="0"/>
              <col width="120"/>
              <thead>
                <tr valign="top">
                  <td colspan="3">Archetype and approach</td>
                  <td colspan="2">GPT-4 (%)</td>
                  <td colspan="2">GPT-4o (%)</td>
                  <td colspan="2">GPT-o3 (%)</td>
                  <td colspan="2">Claude 3.5 (%)</td>
                  <td>Gemini 2.5 Pro (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="12">
                    <bold>Repetitive language and filler words (false positives)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">100</td>
                  <td colspan="2">100</td>
                  <td colspan="2">100</td>
                  <td colspan="2">100</td>
                  <td colspan="2">90</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">100</td>
                  <td colspan="2">100</td>
                  <td colspan="2">0</td>
                  <td colspan="2">100</td>
                  <td colspan="2">85</td>
                </tr>
                <tr valign="top">
                  <td colspan="12">
                    <bold>Fragmented sentences and filler words (false positives)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">100</td>
                  <td colspan="2">100</td>
                  <td colspan="2">66.67</td>
                  <td colspan="2">0</td>
                  <td colspan="2">100</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">100</td>
                  <td colspan="2">100</td>
                  <td colspan="2">66.67</td>
                  <td colspan="2">0</td>
                  <td colspan="2">―<sup>a</sup></td>
                </tr>
                <tr valign="top">
                  <td colspan="12">
                    <bold>Lack of depressive language (false negatives)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">87.67</td>
                  <td colspan="2">89.02</td>
                  <td colspan="2">30</td>
                  <td colspan="2">85</td>
                  <td colspan="2">100</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">87.67</td>
                  <td colspan="2">89.09</td>
                  <td colspan="2">100</td>
                  <td colspan="2">79.09</td>
                  <td colspan="2">100</td>
                </tr>
                <tr valign="top">
                  <td colspan="12">
                    <bold>Excessive amount of laughter (false negatives)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">88.36</td>
                  <td colspan="2">88.34</td>
                  <td colspan="2">78</td>
                  <td colspan="2">90.7</td>
                  <td colspan="2">96.77</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">88.36</td>
                  <td colspan="2">88.9</td>
                  <td colspan="2">80.5</td>
                  <td colspan="2">90.7</td>
                  <td colspan="2">100</td>
                </tr>
                <tr valign="top">
                  <td colspan="12">
                    <bold>Excessive number of sighs (false positives)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Without assessment scale</td>
                  <td colspan="2">67.44</td>
                  <td colspan="2">69.23</td>
                  <td colspan="2">30.8</td>
                  <td colspan="2">52</td>
                  <td colspan="2">60.61</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>With assessment scale</td>
                  <td colspan="2">65.12</td>
                  <td colspan="2">74.19</td>
                  <td colspan="2">29</td>
                  <td colspan="2">52.17</td>
                  <td colspan="2">100</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table10fn1">
                <p><sup>a</sup>Not available.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study reveals the limitations of using LLMs for automated neurobehavioral classification. In both direct diagnosis conditions, we encountered significant limitations with these models, which tended to generate random or close-to-random predictions. The models occasionally refused to offer diagnoses, and when compelled to complete the tasks, the resulting classifications were not accurate. These challenges were even more pronounced with Claude 3.5 and Gemini 2.5 Pro, with which we faced difficulties generating any classification results or ratings in some conditions. The inclusion of assessment scales did not substantially improve performance as the ratings on scale items also appeared to be randomly assigned in most situations. Notably, in many of these conditions, we observed a concerning trend in which assessment scale ratings were often identical across participants regardless of individual differences in their text data.</p>
        <p>It is important to note that previous studies have successfully achieved <italic>F</italic><sub>1</sub>-scores of 70% to 80% using subsets of the ASDBank dataset and high performance (<italic>F</italic><sub>1</sub>-scores of 80%-90%) using various methods on at least portions of the other 2 datasets [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. In contrast, our results indicate that most direct diagnosis approaches and the code generated by these models were not able to attain similar results to those of previous studies. This discrepancy suggests a gap between the performance that ML models can potentially achieve and the outcomes observed in our study. This may be due to our relatively straightforward methodological approach.</p>
        <p>Regarding the code generation condition, our findings suggest that LLM-generated ML pipelines show promising potential for improving diagnostic performance. Notably, on the AphasiaBank dataset, ChatGPT with GPT-o3 produced code that outperformed results reported in previous studies, although the choice of learning algorithms sometimes varied across conditions and lacked a clear rationale.</p>
        <p>In the code generation condition using assessment scales, we observed that the code from the chatbots did not apply diagnostic thresholds as defined by the assessment scales but, instead, directly incorporated the ratings as ML features. The rating methods were simplistic, and the chatbots frequently implemented a keyword-counting algorithm to provide ratings for ASDBank and DAIC-WOZ. These ratings were then concatenated with features extracted from the feature extractor. This direct concatenation of features without sophisticated integration of diagnostic logic may explain why the assessment scale conditions did not lead to improved performance. More effective integration of these ratings in the generated code may help enhance future model performance.</p>
        <p>We also observed that models with built-in chain-of-thought reasoning capabilities such as ChatGPT with GPT-o3 and Gemini 2.5 Pro exhibited improved performance under certain conditions. For instance, in the code generation tasks on the AphasiaBank dataset, these chain-of-thought models consistently outperformed others. Permutation tests conducted on the test sets across 5 cross-validation folds revealed statistically significant differences between models that used chain-of-thought reasoning and those that did not (ChatGPT with GPT-4 vs Gemini 2.5 Pro: accuracy <italic>P=</italic>.01, <italic>F</italic><sub>1</sub>-score <italic>P=</italic>.03; ChatGPT with GPT-4 vs ChatGPT with GPT-o3: accuracy <italic>P&#60;</italic>.001, <italic>F</italic><sub>1</sub>-score <italic>P&#60;</italic>.001; ChatGPT with GPT-4o vs Gemini 2.5 Pro: accuracy <italic>P=</italic>.01, <italic>F</italic><sub>1</sub>-score <italic>P=</italic>.002; ChatGPT with GPT-4o vs ChatGPT with GPT-o3: accuracy <italic>P&#60;</italic>.001, <italic>F</italic><sub>1</sub>-score <italic>P&#60;</italic>.001). While this improvement was not observed across all datasets (ie, DAIC-WOZ and ASDBank), the integration of structured prompting strategies appears to be a promising direction for future research.</p>
        <p>In previous studies, human-in-the-loop processes have demonstrated promise for diagnostic classification tasks [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. However, in such approaches, the human must remain more involved in the computational diagnosis procedure than simply prompting the LLM to generate a direct diagnosis, clinical rating, or classification code. In prior work for autism diagnostics, for example, humans have extracted the behavioral features—a task that requires the ability to interpret relatively subjective human behavior—leaving the ML models to perform the simpler task of the final classification given the human-derived features [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. It is likely that humans performing at least some level of analysis of the data will need to continue to achieve clinically useful performance, and future prompt engineering approaches should explore these ideas more thoroughly.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>We acknowledge several limitations of this study beyond the observed performance gaps.</p>
        <p>First, the scope of our investigation was limited to 3 datasets, each representing a distinct neurobehavioral condition with relatively small sample sizes. This may constrain both the robustness and generalizability of our findings, as well as the models’ capacity to learn effectively.</p>
        <p>Second, another limitation lies in the selection and applicability of the clinical checklists used in the assessment scale approach. In many cases, the patient transcripts lacked sufficient information to reliably rate all items on the scales, potentially resulting in random or invalid scores. Future work may consider using longer or more comprehensive patient transcripts or choosing assessment tools that are more tolerant of limited inputs.</p>
        <p>Third, additional prompting strategies warrant exploration. While we observed performance gains from models that incorporated chain-of-thought reasoning by default, other prompting techniques may also enhance diagnostic accuracy.</p>
        <p>Finally, all input data were presented to the models at once in a single file. This may have hindered their ability to process the content effectively. Presenting the data incrementally one instance at a time could reduce noise and improve prediction consistency.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study demonstrates that popular LLM-based chatbots remain inadequate for classifying neurobehavioral conditions from text transcripts even when prompted to incorporate clinical assessment scales into their evaluation strategy. We recommend that future research further investigate the limitations identified in this study and examine whether incorporating structured tools—such as assessment scales—can serve as a viable method to improve diagnostic accuracy for neurobehavioral conditions when using more sophisticated prompting strategies.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Prompts for large language models.</p>
        <media xlink:href="ai_v4i1e75030_app1.docx" xlink:title="DOCX File , 19 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Code generated by GPT-o3 for ASDBank.</p>
        <media xlink:href="ai_v4i1e75030_app2.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ASD</term>
          <def>
            <p>autism spectrum disorder</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DAIC-WOZ</term>
          <def>
            <p>Distress Analysis Interview Corpus-Wizard-of-Oz</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors gratefully acknowledge the grant support for the AphasiaBank project from the National Institutes of Health National Institute on Deafness and Other Communication Disorders (grant R01-DC008524; 2022-2027). The authors appreciate the permission to use ASDBank and the DAIC-WOZ database. In addition, the authors recognize the contributions of the ChatGPT (OpenAI) and Claude (Anthropic) large language models in editing the grammar and phrasing of the manuscript and generating some of the icons used in the figures, although the authors have further edited the outputs of these models. We also thank the Institute of Linguistics, Academia Sinica, for providing the research environment and for supporting the publication of this work.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets analyzed during this study are available in the AphasiaBank [<xref ref-type="bibr" rid="ref27">27</xref>], ASDBank [<xref ref-type="bibr" rid="ref26">26</xref>], and DAIC-WOZ [<xref ref-type="bibr" rid="ref29">29</xref>] repositories.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>In addition to PW, KL also serves as a corresponding author (limkhaiin@as.edu.tw).</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mathew</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Is artificial intelligence a world changer? A case study of OpenAI’s Chat GPT</article-title>
          <source>Recent Progr Sci Technol</source>
          <year>2023</year>
          <volume>5</volume>
          <fpage>35</fpage>
          <lpage>42</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://stm.bookpi.org/RPST-V5/article/view/9718"/>
          </comment>
          <pub-id pub-id-type="doi">10.9734/bpi/rpst/v5/18240d</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Enabling large language models to generate text with citations</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on May 24, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2305.14627"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.398</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Biswas</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Potential use of Chat GPT in global warming</article-title>
          <source>Ann Biomed Eng</source>
          <year>2023</year>
          <month>06</month>
          <volume>51</volume>
          <issue>6</issue>
          <fpage>1126</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1007/s10439-023-03171-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03171-8</pub-id>
          <pub-id pub-id-type="medline">36856927</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03171-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>YK</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bae</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hahn</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Chain of empathy: enhancing empathetic response of large language models based on psychotherapy models</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on November 2, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2311.04915"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Psy-LLM: scaling up global mental health psychological services with AI-based large language models</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on July 22, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2307.11991"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Detection of multiple mental disorders from social media with two-stream psychiatric experts</article-title>
          <source>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2023</year>
          <conf-name>EMNLP 2023</conf-name>
          <conf-date>December 6-10, 2023</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.562</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Empowering psychotherapy with large language models: cognitive distortion detection through diagnosis of thought prompting</article-title>
          <source>Proceedings of the Association for Computational Linguistics</source>
          <year>2023</year>
          <conf-name>EMNLP 2023</conf-name>
          <conf-date>December 6-10, 2023</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2310.07146"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.284</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhaumik</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Srivastava</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Jalali</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chandrasekharan</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>MindWatch: a smart cloud-based AI solution for suicide ideation detection leveraging large language models</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online on September 26, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2023.09.25.23296062v1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2023.09.25.23296062</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garg</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Urs</surname>
              <given-names>VL</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Chaudhary</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Paliwal</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kar</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>Exploring the role of ChatGPT in patient care (diagnosis and treatment) and medical research: a systematic review</article-title>
          <source>Health Promot Perspect</source>
          <year>2023</year>
          <month>09</month>
          <day>11</day>
          <volume>13</volume>
          <issue>3</issue>
          <fpage>183</fpage>
          <lpage>91</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37808939"/>
          </comment>
          <pub-id pub-id-type="doi">10.34172/hpp.2023.22</pub-id>
          <pub-id pub-id-type="medline">37808939</pub-id>
          <pub-id pub-id-type="pmcid">PMC10558973</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dergaa</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Fekih-Romdhane</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hallit</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Loch</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Glenn</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Fessi</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Ben Aissa</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Souissi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Guelmami</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Swed</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>El Omri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bragazzi</surname>
              <given-names>NL</given-names>
            </name>
            <name name-style="western">
              <surname>Ben Saad</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT is not ready yet for use in providing mental health assessment and interventions</article-title>
          <source>Front Psychiatry</source>
          <year>2023</year>
          <volume>14</volume>
          <fpage>1277756</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38239905"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fpsyt.2023.1277756</pub-id>
          <pub-id pub-id-type="medline">38239905</pub-id>
          <pub-id pub-id-type="pmcid">PMC10794665</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Cuthbert</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis-Fernández</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Narrow</surname>
              <given-names>WE</given-names>
            </name>
            <name name-style="western">
              <surname>Reed</surname>
              <given-names>GM</given-names>
            </name>
          </person-group>
          <article-title>Three approaches to understanding and classifying mental disorder: ICD-11, DSM-5, and the National Institute of Mental Health's Research Domain Criteria (RDoC)</article-title>
          <source>Psychol Sci Public Interest</source>
          <year>2017</year>
          <month>09</month>
          <volume>18</volume>
          <issue>2</issue>
          <fpage>72</fpage>
          <lpage>145</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1177/1529100617727266"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/1529100617727266</pub-id>
          <pub-id pub-id-type="medline">29211974</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="book">
          <source>Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition</source>
          <year>2013</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>American Psychiatric Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bhushan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Machine learning and deep learning approach for medical image analysis: diagnosis to detection</article-title>
          <source>Multimed Tools Appl</source>
          <year>2022</year>
          <month>12</month>
          <day>24</day>
          <volume>82</volume>
          <issue>17</issue>
          <fpage>1</fpage>
          <lpage>39</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36588765"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11042-022-14305-w</pub-id>
          <pub-id pub-id-type="medline">36588765</pub-id>
          <pub-id pub-id-type="pii">14305</pub-id>
          <pub-id pub-id-type="pmcid">PMC9788870</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Iyortsuun</surname>
              <given-names>NK</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Jhon</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Pant</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A review of machine learning and deep learning approaches on mental health diagnosis</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <month>01</month>
          <day>17</day>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>285</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11030285"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11030285</pub-id>
          <pub-id pub-id-type="medline">36766860</pub-id>
          <pub-id pub-id-type="pii">healthcare11030285</pub-id>
          <pub-id pub-id-type="pmcid">PMC9914523</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Athaluri</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Manthena</surname>
              <given-names>SV</given-names>
            </name>
            <name name-style="western">
              <surname>Kesapragada</surname>
              <given-names>VS</given-names>
            </name>
            <name name-style="western">
              <surname>Yarlagadda</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Dave</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Duddumpudi</surname>
              <given-names>RT</given-names>
            </name>
          </person-group>
          <article-title>Exploring the boundaries of reality: investigating the phenomenon of artificial intelligence hallucination in scientific writing through ChatGPT references</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <month>04</month>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>e37432</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37182055"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.37432</pub-id>
          <pub-id pub-id-type="medline">37182055</pub-id>
          <pub-id pub-id-type="pmcid">PMC10173677</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Frieske</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ishii</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bang</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Madotto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Survey of hallucination in natural language generation</article-title>
          <source>ACM Comput Surv</source>
          <year>2023</year>
          <month>03</month>
          <day>03</day>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>1</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1145/3571730</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>PP</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: a comprehensive review on background, applications, key challenges, bias, ethics, limitations and future scope</article-title>
          <source>Internet Things Cyber Phys Syst</source>
          <year>2023</year>
          <volume>3</volume>
          <fpage>121</fpage>
          <lpage>54</lpage>
          <pub-id pub-id-type="doi">10.1016/j.iotcps.2023.04.003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rasool</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shahzad</surname>
              <given-names>MI</given-names>
            </name>
            <name name-style="western">
              <surname>Aslam</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Emotion-aware response generation using affect-enriched embeddings with LLMs</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on October 2, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/384599486_Emotion-Aware_Response_Generation_Using_Affect-Enriched_Embeddings_with_LLMs"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>ChatGPT homepage</article-title>
          <source>ChatGPT</source>
          <access-date>2025-09-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://chat.openai.com/">https://chat.openai.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>Gemini Team Google</collab>
          </person-group>
          <article-title>Gemini: a family of highly capable multimodal models</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on December 19, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2312.11805"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <article-title>Claude homepage</article-title>
          <source>Claude</source>
          <access-date>2025-09-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://claude.ai/login?returnTo=%2F%3F">https://claude.ai/login?returnTo=%2F%3F</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Cambria</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>BW</given-names>
            </name>
          </person-group>
          <article-title>Will affective computing emerge from foundation models and general artificial intelligence? A first evaluation of ChatGPT</article-title>
          <source>IEEE Intell Syst</source>
          <year>2023</year>
          <month>03</month>
          <volume>38</volume>
          <issue>2</issue>
          <fpage>15</fpage>
          <lpage>23</lpage>
          <pub-id pub-id-type="doi">10.1109/mis.2023.3254179</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pandya</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lodha</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ganatra</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Is ChatGPT ready to change mental healthcare? Challenges and considerations: a reality-check</article-title>
          <source>Front Hum Dyn</source>
          <year>2024</year>
          <month>1</month>
          <day>11</day>
          <volume>5</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.frontiersin.org/journals/human-dynamics/articles/10.3389/fhumd.2023.1289255/full"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fhumd.2023.1289255</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giadikiaroglou</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lymperaiou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Filandrianos</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Stamou</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Puzzle solving using reasoning of large language models: a survey</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 17, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2402.11291"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.646</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>WW</given-names>
            </name>
          </person-group>
          <article-title>Program of thoughts prompting: disentangling computation from reasoning for numerical reasoning tasks</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on November 22, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2211.12588"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>MacWhinney</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Understanding spoken language through TalkBank</article-title>
          <source>Behav Res Methods</source>
          <year>2019</year>
          <month>08</month>
          <day>3</day>
          <volume>51</volume>
          <issue>4</issue>
          <fpage>1919</fpage>
          <lpage>27</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30511153"/>
          </comment>
          <pub-id pub-id-type="doi">10.3758/s13428-018-1174-9</pub-id>
          <pub-id pub-id-type="medline">30511153</pub-id>
          <pub-id pub-id-type="pii">10.3758/s13428-018-1174-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC6546550</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>MacWhinney</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Fromm</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Forbes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Holland</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>AphasiaBank: methods for studying discourse</article-title>
          <source>Aphasiology</source>
          <year>2011</year>
          <month>09</month>
          <day>22</day>
          <volume>25</volume>
          <issue>11</issue>
          <fpage>1286</fpage>
          <lpage>307</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/22923879"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/02687038.2011.589893</pub-id>
          <pub-id pub-id-type="medline">22923879</pub-id>
          <pub-id pub-id-type="pmcid">PMC3424615</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <article-title>TalkBank homepage</article-title>
          <source>TalkBank</source>
          <access-date>2025-09-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://talkbank.org/">https://talkbank.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>DeVault</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Georgila</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Artstein</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Morbini</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Traum</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Scherer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rizzo</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Morency</surname>
              <given-names>LP</given-names>
            </name>
          </person-group>
          <article-title>Verbal indicators of psychological distress in interactive dialogue with a virtual human</article-title>
          <source>Proceedings of the SIGDIAL 2013 Conference</source>
          <year>2013</year>
          <conf-name>SIGDIAL 2013</conf-name>
          <conf-date>August 22-24, 2013</conf-date>
          <conf-loc>Metz, France</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bologna</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>What is aphasia?</article-title>
          <source>National Aphasia Association</source>
          <access-date>2025-09-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aphasia.org/what-is-aphasia/">https://aphasia.org/what-is-aphasia/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>American Psychiatric Association</collab>
          </person-group>
          <source>Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition, Text Revision (DSM-5-TR)</source>
          <year>2022</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>American Psychiatric Association</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dinkel</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Text-based depression detection on sparse data</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on April 8, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1904.05154"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mishra</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Enhancing depression detection in clinical interviews: integration of fused BERT-BiLSTM Model and XGBoost</article-title>
          <source>Proceedings of the 2024 10th International Conference on Computing and Artificial Intelligence</source>
          <year>2024</year>
          <conf-name>ICCAI '24</conf-name>
          <conf-date>April 26-29, 2024</conf-date>
          <conf-loc>Bali Island, Indonesia</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1145/3669754.3669780"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3669754.3669780</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>LaCroix</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Leveraging pre-trained large language models for aphasia detection in English and Chinese speakers</article-title>
          <source>Proceedings of the 6th Clinical Natural Language Processing Workshop</source>
          <year>2024</year>
          <conf-name>ClinicalNLP 2024</conf-name>
          <conf-date>June 20-21, 2024</conf-date>
          <conf-loc>Mexico City, Mexico</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.20</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Masud</surname>
              <given-names>JH</given-names>
            </name>
          </person-group>
          <article-title>Applying machine learning to language problem analysis</article-title>
          <source>Proceedings of the IEEE International Conference on Information Reuse and Integration for Data Science</source>
          <year>2024</year>
          <conf-name>IRI 2024</conf-name>
          <conf-date>August 7-9, 2024</conf-date>
          <conf-loc>San Jose, CA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/iri62200.2024.00050</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kertesz</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <source>Western Aphasia Battery–Revised</source>
          <year>2007</year>
          <publisher-loc>San Antonio, TX</publisher-loc>
          <publisher-name>The Psychological Corporation</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baron-Cohen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wheelwright</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Skinner</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Clubley</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>The autism-spectrum quotient (AQ): evidence from Asperger syndrome/high-functioning autism, males and females, scientists and mathematicians</article-title>
          <source>J Autism Dev Disord</source>
          <year>2001</year>
          <month>02</month>
          <volume>31</volume>
          <issue>1</issue>
          <fpage>5</fpage>
          <lpage>17</lpage>
          <pub-id pub-id-type="doi">10.1023/a:1005653411471</pub-id>
          <pub-id pub-id-type="medline">11439754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Burns</surname>
              <given-names>DD</given-names>
            </name>
          </person-group>
          <source>The Feeling Good Handbook</source>
          <year>1999</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Plume</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Washington</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>A perspective on crowdsourcing and human-in-the-loop workflows in precision health</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <month>04</month>
          <day>11</day>
          <volume>26</volume>
          <fpage>e51138</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024//e51138/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/51138</pub-id>
          <pub-id pub-id-type="medline">38602750</pub-id>
          <pub-id pub-id-type="pii">v26i1e51138</pub-id>
          <pub-id pub-id-type="pmcid">PMC11046386</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Washington</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wall</surname>
              <given-names>DP</given-names>
            </name>
          </person-group>
          <article-title>A review of and roadmap for data science and machine learning for the neuropsychiatric phenotype of autism</article-title>
          <source>Annu Rev Biomed Data Sci</source>
          <year>2023</year>
          <month>08</month>
          <day>10</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>211</fpage>
          <lpage>28</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.annualreviews.org/content/journals/10.1146/annurev-biodatasci-020722-125454?crawler=true&#38;mimetype=application/pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.1146/annurev-biodatasci-020722-125454</pub-id>
          <pub-id pub-id-type="medline">37137169</pub-id>
          <pub-id pub-id-type="pmcid">PMC11093217</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tariq</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Daniels</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Washington</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kalantarian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wall</surname>
              <given-names>DP</given-names>
            </name>
          </person-group>
          <article-title>Mobile detection of autism through machine learning on home video: a development and prospective validation study</article-title>
          <source>PLoS Med</source>
          <year>2018</year>
          <month>11</month>
          <volume>15</volume>
          <issue>11</issue>
          <fpage>e1002705</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pmed.1002705"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pmed.1002705</pub-id>
          <pub-id pub-id-type="medline">30481180</pub-id>
          <pub-id pub-id-type="pii">PMEDICINE-D-18-01991</pub-id>
          <pub-id pub-id-type="pmcid">PMC6258501</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Washington</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tariq</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Leblanc</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Chrisman</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dunlap</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kline</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kalantarian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Penev</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Paskov</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Voss</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Stockham</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Varma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Husic</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kent</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Haber</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Winograd</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wall</surname>
              <given-names>DP</given-names>
            </name>
          </person-group>
          <article-title>Crowdsourced privacy-preserved feature tagging of short home videos for machine learning ASD detection</article-title>
          <source>Sci Rep</source>
          <year>2021</year>
          <month>04</month>
          <day>07</day>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>7620</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-021-87059-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-021-87059-4</pub-id>
          <pub-id pub-id-type="medline">33828118</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-021-87059-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC8027393</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
