<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e83927</article-id><article-id pub-id-type="doi">10.2196/83927</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Simulated Reasoning and Self-Verification for Psychiatric Diagnosis in Generalist Large Language Models: Comparative Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Sarma</surname><given-names>Karthik V</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hanss</surname><given-names>Kaitlin E</given-names></name><degrees>MD, MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Halls</surname><given-names>Andrew J M</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Becker</surname><given-names>Daniel 
F</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Glowinski</surname><given-names>Anne L</given-names></name><degrees>MD, MPE</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Krystal</surname><given-names>Andrew D</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Psychiatry and Behavioral Sciences, University of California, San Francisco</institution><addr-line>675 18th Street, Box 3134</addr-line><addr-line>San Francisco</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Philip R Lee Institute for Health Policy Studies, University of California, San Francisco</institution><addr-line>San Francisco</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Malin</surname><given-names>Bradley</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Rasool</surname><given-names>Abdur</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Paulus</surname><given-names>Martin P</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Karthik V Sarma, MD, PhD, Department of Psychiatry and Behavioral Sciences, University of California, San Francisco, 675 18th Street, Box 3134, San Francisco, CA, 94107, United States, 1 415-476-7527; <email>Karthik.Sarma@ucsf.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>8</day><month>6</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e83927</elocation-id><history><date 
date-type="received"><day>15</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>24</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>03</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Karthik V Sarma, Kaitlin E Hanss, Andrew J M Halls, Daniel F Becker, Anne L Glowinski, Andrew D Krystal. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 8.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e83927"/><abstract><sec><title>Background</title><p>Large language models (LLMs) and, more recently, large reasoning models (LRMs) have rapidly garnered significant interest for application in psychiatry and behavioral health. However, recent studies have identified significant shortcomings and potential risks in the performance of LLM-based systems, complicating their application to psychiatric diagnosis. 
Two promising approaches to addressing these challenges and improving the efficacy of these models are simulated reasoning (SR) and self-verification (SV), in which additional &#x201C;reasoning tokens&#x201D; are used to guide model output, either during or after inference.</p></sec><sec><title>Objective</title><p>We aimed to explore how the use of SR (via LRMs) and SV (via supplemental prompting) affects the psychiatric diagnostic performance of LLMs.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 106 case vignettes and associated diagnoses were extracted from the DSM-5-TR (Diagnostic and Statistical Manual, Version 5, Text Revision) Clinical Cases book, with permission. Both an LLM and an LRM were selected from the latest available model generation for each of the two vendors studied (OpenAI and Google). Two inference approaches were developed: a Basic approach that directly prompted models to provide diagnoses and an SV approach that augmented the Basic approach with additional prompts. All case vignettes were processed by the two LLMs, two LRMs, and two inference approaches, and diagnostic performance was evaluated using the sensitivity and positive predictive value (PPV). Binomial generalized linear mixed models were used to test for significant differences between the model vendors (OpenAI, Google), type (LLM, LRM), and the addition of an SV prompt.</p></sec><sec sec-type="results"><title>Results</title><p>All vignettes were successfully processed by each model and inference approach. Sensitivity ranged from 0.732 to 0.817, and PPV ranged from 0.534 to 0.779. The best overall performance was found in the <italic>o3-pro</italic> LRM using SV, with a sensitivity of 0.782 and a PPV of 0.779. No statistically significant fixed effects were found for sensitivity. For PPV, a statistically significant effect was found for prompt type (SV, <italic>P</italic>=.007) and model type (LRM, <italic>P</italic>=.009). 
No significant interaction effects were identified.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>We found that both SR and SV yielded statistically significant improvements in the PPV, without significant differences in the sensitivity. The addition of the manually specified SV prompt improved the PPV even when simulated reasoning was used. This suggests that future efforts to apply language models in behavioral health could benefit from manually crafted reasoning prompts and automated SR.</p></sec></abstract><kwd-group><kwd>psychiatry</kwd><kwd>diagnosis</kwd><kwd>large language models</kwd><kwd>large reasoning models</kwd><kwd>reasoning</kwd><kwd>simulated reasoning</kwd><kwd>self verification</kwd><kwd>DSM</kwd><kwd>Diagnostic and Statistical Manual</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) have rapidly garnered significant interest for application in psychiatry and behavioral health. LLMs, a type of deep artificial intelligence (AI) system, are trained as &#x201C;next token&#x201D; (ie, next word) predictors on very large-scale unsupervised text corpora, with estimates in the trillions of tokens [<xref ref-type="bibr" rid="ref1">1</xref>]. These models have demonstrated unexpected and remarkable emergent capabilities in the areas of information retrieval and processing and problem-solving, suggestive of a capability to engage with the semantic content of unstructured text not possible with previous technologies.</p><p>These emergent capabilities have enabled the demonstration of both clinical and consumer-facing LLM-based systems with a broad set of applications across fields of medicine. 
In behavioral health, projects have demonstrated their use in automated diagnostic and therapeutic interactive agents (ie, chatbots) [<xref ref-type="bibr" rid="ref2">2</xref>], question-answering [<xref ref-type="bibr" rid="ref3">3</xref>], diagnostic reasoning [<xref ref-type="bibr" rid="ref4">4</xref>], and analysis of psychotherapy transcripts [<xref ref-type="bibr" rid="ref5">5</xref>]. However, recent studies have also identified significant shortcomings and potential risks in the performance of LLM-based systems, including hallucination, bias, sycophancy, and the generation of associated misaligned and dangerous content for users experiencing mental health challenges [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. These challenges complicate the application of large language models to psychiatric diagnosis, especially in the context of the variability in interrater reliability of DSM diagnoses [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Though some published work has noted the efficacy of structured prompt-based decision processes [<xref ref-type="bibr" rid="ref4">4</xref>], other works have found that structured behavioral health prompts, such as psychometric scales, may not work effectively with generalist models [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Two promising approaches to addressing these challenges and improving the efficacy of these models are simulated reasoning (SR) and self-verification (SV). In these methods, a structured approach is used to generate additional tokens (known as <italic>reasoning tokens</italic>), either generated by the model or provided by the system designer, that increase the probability of a correct response. In SR, tokens are generated to create a reasoning &#x201C;chain of thought&#x201D; that leads to the model&#x2019;s final response. 
In SV, once a preliminary response is generated by the model, reasoning tokens are added to lead the model to re-evaluate the correctness of this answer before generating a final result. LLMs that automatically incorporate the use of reasoning tokens are often known as large reasoning models (LRMs), and the major vendors of LLMs have now produced LRM versions of their latest-generation tools (eg, OpenAI o3, Google Gemini 2.5 Pro). Some recent works, however, have suggested that the use of reasoning tokens in LRMs may not lead to reasoning-like processes [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>] in some settings. For example, in one evaluation by a major vendor, researchers found that reasoning tokens output by an LRM frequently did not represent actual reasoning processes, as evidenced by their failure to note direct hints that were provided in initial prompts [<xref ref-type="bibr" rid="ref14">14</xref>]. In contrast, a large vendor recently stated that they plan to automatically move conversations that contain acute distress into reasoning models for &#x201C;more helpful and beneficial responses&#x201D; in these circumstances [<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>Though the exploration of LLM-based tools for clinical application continues to progress rapidly, few studies are available to guide the choice of model or reasoning approach specifically for behavioral health, or to provide an estimation of how these models will perform in this setting. There is an expanding body of evidence on the use of these tools across other fields of medicine, with varied findings. 
In one recent paper, a selection of LLMs and LRMs were tasked with clinical reasoning using vignettes across domains of medicine; this work found high accuracy on final diagnosis but poor performance in other reasoning stages [<xref ref-type="bibr" rid="ref17">17</xref>]; another study similarly attempted to assess diagnostic reasoning across domains of medicine using script concordance testing and found sub-clinician reasoning performance [<xref ref-type="bibr" rid="ref18">18</xref>]. Neither study specifically investigated the differential impact of simulated reasoning on outcomes. Several efforts have worked towards developing benchmarks to enable serial evaluation of performance, without a focus on behavioral health [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>In prior work, we demonstrated that fixed prompt decision trees improve LLM psychiatric diagnostic performance [<xref ref-type="bibr" rid="ref4">4</xref>] in a vignette dataset, without assessing LRMs. A recent unpublished preprint similarly used a vignette dataset to assess diagnostic performance, making use of state-of-the-art LRMs and finding moderate to high diagnostic accuracy, without specifically investigating the differential impact of methods like SR or SV. 
Another study examined the use of LLMs with real-world EHR data, finding promising but variable diagnostic performance, without investigating methodological factors [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>To address the evidence gap in understanding the impact of differing reasoning methods for behavioral health, we aim here to evaluate whether the use of SR (ie, LRMs) and/or SV (ie, standardized post-processing) enhances LLM performance on a vignette-based diagnostic task.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>Institutional Review Board submission and approval were not required for this study because it contains no human subjects research.</p></sec><sec id="s2-2"><title>Standardized Task and Dataset</title><p>We extracted 106 case vignettes written by 133 discussants across 19 primary DSM-5-TR (Diagnostic and Statistical Manual, Version 5, Text Revision) diagnostic categories from the DSM-5-TR Clinical Cases book. Each vignette is a multi-paragraph description of a particular case; the vignettes are written for educational purposes; the author&#x2019;s intended final diagnoses are also provided separately from the vignette. The standardized task was to generate diagnoses from the case vignette, with the author-designated diagnoses used as the gold-standard comparison.</p><sec id="s2-2-1"><title>Memorization Analysis</title><p>To assess the potential impact of the models&#x2019; possible prior knowledge of these vignettes, we conducted a two-part memorization analysis. For each of the 106 vignettes, we constructed two truncation points: an early truncation (after sentence 3) and a midpoint truncation (at the halfway point of the vignette by sentence count). In separate queries for each truncation point, the model was provided with vignette text up to that point and prompted to provide the following exact sentences or, if unable, to write a plausible continuation. 
The exact prompt was as follows:</p><disp-quote><p>The following is the beginning of a passage of text. Please continue the passage by writing exactly the next 2&#x2010;3 sentences as they appear in the original source. If you do not recognize the text, write a plausible continuation.</p><attrib>&#x003C;TEXT&#x003E;</attrib></disp-quote><p>This was performed independently for all of the LLMs and LRMs used in the study. Three similarity metrics were computed for each model-generated continuation by comparison to the vignette: ROUGE-L [<xref ref-type="bibr" rid="ref22">22</xref>], BLEU-4 [<xref ref-type="bibr" rid="ref23">23</xref>], and exact sentence match.</p></sec></sec><sec id="s2-3"><title>Model Selection and Configuration</title><p>For this study, we selected one LLM and one LRM from two major vendors from whom an enterprise-grade LLM application programming interface (API) was available, and which had agreed terms of use that (1) were compatible with the intellectual property rights of the study researchers and rightsholders associated with the study data, and (2) prohibited the use of study data for model training by vendors. The vendors and models used for this study were: OpenAI (LLM: <italic>gpt-4.1-2025-04-14</italic>, LRM: <italic>o3-pro-2025-06-10</italic>), and Google (LLM: <italic>Gemini 2.5 Flash</italic>, LRM: <italic>Gemini 2.5 Pro</italic>). The LLMs were used at their default settings, with an output token limit of 2000. LRMs were provided with 16,000 reasoning tokens, and OpenAI models were also set to medium reasoning effort (Google models do not provide this parameter). Reasoning was disabled for both LLMs. Safety settings were disabled or set to the minimum level of filtering. Models were constrained to respond in a specified machine-parsable output format (JavaScript Object Notation, or JSON) either through the use of API flags or by reprompting until the output was in valid JSON. 
All prompts and reprompts were independent without inclusion of prior input, output, or logs.</p></sec><sec id="s2-4"><title>Inference Approaches</title><p>We compared two inference approaches previously developed by the authors [<xref ref-type="bibr" rid="ref4">4</xref>] using each of the available models. In the first approach (the &#x201C;Basic&#x201D; approach), the model was directly prompted to assign diagnoses to the vignette using a standardized prompt (<xref ref-type="other" rid="box1">Textbox 1</xref>). In the second approach (the &#x201C;SV&#x201D; approach), diagnoses generated by the Basic approach were evaluated using a sequential pairwise elimination procedure. Each pair of candidate diagnoses was presented to the model using a standardized prompt asking whether both diagnoses were necessary or whether one was better explained by the other (<xref ref-type="other" rid="box1">Textbox 1</xref>); each of these prompts was done independently without including context from any other prompt. If both were retained, the procedure continued; if only one was selected, the other was removed from further evaluation. Because this procedure requires at least two candidates, vignettes for which the Basic approach produced a single diagnosis were not modified by SV, and if a candidate diagnosis list was reduced to only one candidate, the procedure was terminated. This approach serves as a templated unidirectional filter that can only narrow the list of diagnoses.</p><boxed-text id="box1"><title> Prompt templates for inference approaches.</title><p><underline><bold>Basic prompt</bold></underline></p><p><underline/></p><list list-type="simple"><list-item><p>I am going to give you an academic psychiatry clinical case that describes a patient with one or more psychiatric DSM-5-TR diagnoses, and you are going to answer questions about that case that I provide. 
The clinical case is as follows: &#x003C;&#x003C;X&#x003E;&#x003E;</p></list-item></list><p/><list list-type="simple"><list-item><p>Please provide me with a list of DSM-5-TR diagnoses, without specifiers or modifiers, that you believe apply to this patient based solely on the clinical case. Please format them as a JSON list titled &#x201C;diagnoses&#x201D; with one diagnosis per entry. Do not include any other text in your response. Do not include any incorrect, inappropriate, or candidate diagnoses. For example, if the diagnoses are &#x201C;Insomnia Disorder&#x201D; and &#x201C;Bipolar I Disorder&#x201D;, you would reply with: {&#x201C;diagnoses&#x201D;: [&#x201C;Insomnia Disorder&#x201D;, &#x201C;Bipolar I Disorder&#x201D;]}</p></list-item></list><p/><p/><p><underline><bold>Self-verification prompt</bold></underline></p><p/><p>The following two DSM-5-TR diagnoses are candidate diagnoses for this patient: &#x003C;&#x003C;X&#x003E;&#x003E; and &#x003C;&#x003C;Y&#x003E;&#x003E; . We are interested in deciding if both diagnoses are necessary for the patient, or if one of these two diagnoses is better explained by the other. Please respond with a list of which of these two diagnoses are necessary for the patient in JSON format. 
For example, if the candidate diagnoses are &#x201C;major depressive disorder&#x201D; and &#x201C;adjustment disorder&#x201D;, respond with [&#x201C;major depressive disorder&#x201D;, &#x201C;adjustment disorder&#x201D;] if both diagnoses are necessary, or with either [&#x201C;major depressive disorder&#x201D;] or [&#x201C;adjustment disorder&#x201D;] if one diagnosis is better than the other.</p></boxed-text></sec><sec id="s2-5"><title>Scoring and Analysis</title><sec id="s2-5-1"><title>Diagnosis Matching, Simplification, Scoring, and Metrics</title><p>To facilitate ease of comparison of model-generated and author-designated diagnoses, a previously developed semi-automated standardized diagnosis matching and simplification system was applied to all diagnoses. In this system, all specifiers and modifiers were removed, neurocognitive disorders were collapsed into either a catch-all diagnosis for delirium or a catch-all diagnosis for major or mild neurocognitive disorders, and then diagnoses were systematically matched to DSM-5-TR diagnoses, Z codes, or other categories. See <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for full details on this matching system. A model-generated diagnosis was scored as a true positive (TP) if it matched an author-designated diagnosis, or as a false positive (FP) otherwise; author-designated diagnoses without a matching model prediction were scored as false negatives (FN). Per-vignette sensitivity was calculated as TP/(TP+FN) and PPV as TP/(TP+FP).</p></sec><sec id="s2-5-2"><title>Statistical Analysis</title><p>Sensitivity and PPV were averaged across vignettes for reporting, and 95% confidence intervals were obtained by bootstrap resampling with replacement (10,000 iterations). During bootstrap computation, shared indices were used across all experiments to ensure consistency. Macro-averages were computed for reporting, giving equal weight to each vignette, to align with statistical modeling. 
Micro-averages were also computed to assess for consistency. For statistical comparison, we fit count-level (ie, TP, FP, FN) binomial generalized linear mixed models (GLMMs) to sensitivity and PPV, using a logit link and random intercept in <italic>R</italic>, to examine the fixed effects of model vendor (OpenAI, Google), type (LLM, LRM) and use of the SV prompt; two- and three-way interactions were also examined. A threshold of <italic>P</italic>&#x003C;.05 was used to evaluate for significant fixed effects.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>All 106 vignettes were processed by the four models, without any parsing errors or context window length overruns that would have required reprompting. The memorization analysis was reassuring against memorization; no model produced any exact sentence match and ROUGE-L and BLEU-4 scores demonstrated minimal overlap (<xref ref-type="table" rid="table1">Table 1</xref>). There were between 0 and 5 author-designated diagnoses per vignette distributed as: 0 (0.9%), 1 (58.5%), 2 (30.2%), 3 (7.5%), 4 (1.9%), and 5 (0.9%); the average number per vignette was 1.54. 
Two cases were not processed by the Google models due to a content filter block; both cases were treated as if they had zero predicted diagnoses.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Memorization analysis by model.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Vendor</td><td align="left" valign="bottom">Model type</td><td align="left" valign="bottom">ROUGE-L (mean, SD)</td><td align="left" valign="bottom">BLEU-4 (mean, SD)</td></tr></thead><tbody><tr><td align="left" valign="top">OpenAI</td><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">0.154 (0.040)</td><td align="left" valign="top">0.013 (0.012)</td></tr><tr><td align="left" valign="top">Google</td><td align="left" valign="top">LLM</td><td align="left" valign="top">0.153 (0.047)</td><td align="left" valign="top">0.013 (0.019)</td></tr><tr><td align="left" valign="top">OpenAI</td><td align="left" valign="top">LRM<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">0.106 (0.047)</td><td align="left" valign="top">0.007 (0.011)</td></tr><tr><td align="left" valign="top">Google</td><td align="left" valign="top">LRM</td><td align="left" valign="top">0.153 (0.044)</td><td align="left" valign="top">0.013 (0.014)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table1fn2"><p><sup>b</sup>LRM: large reasoning model. </p></fn></table-wrap-foot></table-wrap><p>Simplification and matching were completed, and all model-predicted diagnoses were matched to a DSM-5-TR diagnosis, Z code, or a non-psychiatric diagnosis. 
There was one case (7.2.1) with zero author-designated diagnoses (ie, no diagnosis was applicable); in this case, the sensitivity was undefined and was set to 0 for all models as all models incorrectly generated at least one false positive diagnosis for this case. There were no cases in which the PPV was undefined (ie, the model generated no diagnoses). Sensitivity ranged from 0.732 to 0.817 and PPV ranged from 0.534 to 0.779. Macro-averaged values for sensitivity and PPV for each experiment are available in <xref ref-type="table" rid="table2">Table 2</xref>; micro-averaged values were computed and found to provide qualitatively similar results. Total counts across each category for each experiment are available in <xref ref-type="table" rid="table3">Table 3</xref>. The Basic approach generated an average of 1.73 to 2.42 diagnoses per vignette across all four models, which SV reduced to 1.57 to 1.89. Per-vignette performance is depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>, and individual vignette-level results are available in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>Binomial GLMMs were successfully fit. The intraclass correlation (ICC) was 0.77 for sensitivity and 0.49 for PPV. No statistically significant fixed effects were found for sensitivity. For PPV, a statistically significant effect was found for prompt type (SV, <italic>P</italic>=.007) and model type (LRM, <italic>P</italic>=.009). 
No statistically significant interactions were identified.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>LLM diagnostic performance metrics by experiment.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Vendor</td><td align="left" valign="bottom">Model type</td><td align="left" valign="bottom">Inference approach</td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">PPV<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" rowspan="4">Google</td><td align="left" valign="top" rowspan="2">LLM<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Basic</td><td align="left" valign="top">0.77 [0.68&#x2010;0.84]</td><td align="left" valign="top">0.53 [0.47&#x2010;0.60]</td></tr><tr><td align="left" valign="top">SV<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.75 [0.68&#x2010;0.82]</td><td align="left" valign="top">0.63 [0.56&#x2010;0.70]</td></tr><tr><td align="left" valign="top" rowspan="2">LRM<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">Basic</td><td align="left" valign="top">0.79 [0.72&#x2010;0.86]</td><td align="left" valign="top">0.63 [0.56&#x2010;0.69]</td></tr><tr><td align="left" valign="top">SV</td><td align="left" valign="top">0.77 [0.70&#x2010;0.84]</td><td align="left" valign="top">0.67 [0.60&#x2010;0.74]</td></tr><tr><td align="left" valign="top" rowspan="4">OpenAI</td><td align="left" valign="top" rowspan="2">LLM</td><td align="left" valign="top">Basic</td><td align="left" valign="top">0.75 [0.67&#x2010;0.82]</td><td align="left" valign="top">0.57 [0.50&#x2010;0.64]</td></tr><tr><td align="left" valign="top">SV</td><td align="left" valign="top">0.73 [0.66&#x2010;0.80]</td><td align="left" valign="top">0.64 
[0.57&#x2010;0.71]</td></tr><tr><td align="left" valign="top" rowspan="2">LRM</td><td align="left" valign="top">Basic</td><td align="left" valign="top">0.82 [0.75&#x2010;0.88]</td><td align="left" valign="top">0.76 [0.69&#x2010;0.83]</td></tr><tr><td align="left" valign="top">SV</td><td align="left" valign="top">0.78 [0.71&#x2010;0.85]</td><td align="left" valign="top">0.78 [0.71&#x2010;0.85]</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>PPV: positive predictive value. </p></fn><fn id="table2fn2"><p><sup>b</sup>LLM: large language model.</p></fn><fn id="table2fn3"><p><sup>c</sup>SV: self-verification.</p></fn><fn id="table2fn4"><p><sup>d</sup>LRM: large reasoning model.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Diagnosis statistics by experiment.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Vendor</td><td align="left" valign="bottom">Model type</td><td align="left" valign="bottom">Inference approach</td><td align="left" valign="bottom">Author Dx<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">Model Dx</td><td align="left" valign="bottom">TP<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">FP<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="bottom">FN<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" rowspan="4">Google</td><td align="left" valign="top" rowspan="2">LLM<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">Basic</td><td align="left" valign="top">163</td><td align="left" valign="top">257</td><td align="left" valign="top">124</td><td align="left" valign="top">133</td><td align="left" valign="top">39</td></tr><tr><td align="left" 
valign="top">SV<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">163</td><td align="left" valign="top">200</td><td align="left" valign="top">121</td><td align="left" valign="top">79</td><td align="left" valign="top">42</td></tr><tr><td align="left" valign="top" rowspan="2">LRM<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" valign="top">Basic</td><td align="left" valign="top">163</td><td align="left" valign="top">213</td><td align="left" valign="top">127</td><td align="left" valign="top">86</td><td align="left" valign="top">36</td></tr><tr><td align="left" valign="top">SV</td><td align="left" valign="top">163</td><td align="left" valign="top">188</td><td align="left" valign="top">121</td><td align="left" valign="top">67</td><td align="left" valign="top">42</td></tr><tr><td align="left" valign="top" rowspan="4">OpenAI</td><td align="left" valign="top" rowspan="2">LLM</td><td align="left" valign="top">Basic</td><td align="left" valign="top">163</td><td align="left" valign="top">233</td><td align="left" valign="top">125</td><td align="left" valign="top">108</td><td align="left" valign="top">38</td></tr><tr><td align="left" valign="top">SV</td><td align="left" valign="top">163</td><td align="left" valign="top">192</td><td align="left" valign="top">120</td><td align="left" valign="top">72</td><td align="left" valign="top">43</td></tr><tr><td align="left" valign="top" rowspan="2">LRM</td><td align="left" valign="top">Basic</td><td align="left" valign="top">163</td><td align="left" valign="top">183</td><td align="left" valign="top">135</td><td align="left" valign="top">48</td><td align="left" valign="top">28</td></tr><tr><td align="left" valign="top">SV</td><td align="left" valign="top">163</td><td align="left" valign="top">166</td><td align="left" valign="top">126</td><td align="left" valign="top">40</td><td align="left" valign="top">37</td></tr></tbody></table><table-wrap-foot><fn 
id="table3fn1"><p><sup>a</sup>Author Dx: author-designated diagnoses.</p></fn><fn id="table3fn2"><p><sup>b</sup>TP: true positive.</p></fn><fn id="table3fn3"><p><sup>c</sup>FP: false positive.</p></fn><fn id="table3fn4"><p><sup>d</sup>FN: false negative.</p></fn><fn id="table3fn5"><p><sup>e</sup>LLM: large language model.</p></fn><fn id="table3fn6"><p><sup>f</sup>SV: self-verification.</p></fn><fn id="table3fn7"><p><sup>g</sup>LRM: large reasoning model.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Per-case results of evaluation of diagnostic performance, by experiment and diagnostic category. This color map displays the per-case sensitivity and positive predictive value (PPV) for diagnosis for every vignette, grouped by DSM-5-TR (Diagnostic and Statistical Manual, Version 5, Text Revision) category. Each row consists of experimental results in one specific category for one specific experiment (ie, combination of model and inference approach). A legend is displayed for the color map. Each column within each metric represents a single specific case vignette.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e83927_fig01.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>In this paper, we sought to evaluate the impact of SR and SV on the psychiatric diagnostic performance of language models from two different vendors.</p><sec id="s4-1"><title>Principal Findings</title><p>We successfully processed 106 psychiatric case vignettes using the selected approaches and found that both simulated reasoning and self-verification yielded statistically significant improvements in the PPV, compared to the Basic approach alone, without significant differences in the sensitivity. 
The best overall performance of models was found in the <italic>o3-pro</italic> LRM with the additional SV prompt, yielding a sensitivity of 0.782 and a PPV of 0.779. The sensitivity ICC of 0.77 suggests that much of the variance found in our experiments was explained by the intrinsic difficulty of the individual vignettes, a finding also clear in the per-vignette results depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>. In contrast, the PPV ICC of 0.49 indicates that the approach and the underlying vignette differences contributed approximately equal variance, which is consistent with the effects we found in PPV and the underlying mechanics of the SV method (since, for example, the SV prompt acts as a unidirectional filter, only removing diagnoses). The substantial case-level variability is consistent with past findings, such as the variation in inter-rater reliability across categories found in the DSM-5 field trials [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>Interestingly, we found no statistically significant interaction effect between SR and SV, suggesting that the use of both yielded no additional benefit over either one alone. This finding may, however, be due to a lack of sufficient power from our study to detect these differences, and future evaluation is required. Thus, the addition of explicit reasoning suggestions in prompts (such as that in the SV prompt) could continue to be helpful even when using automated simulated reasoning systems. 
Similarly, we found no significant impact of the use of either approach on sensitivity; this is unsurprising for SV given that this approach can only remove diagnoses, but was surprising for SR, which theoretically could use reasoning tokens to expand diagnostic range.</p></sec><sec id="s4-2"><title>Limitations</title><p>First, the semiautomated simplification and matching process we used was intended to give models the best opportunity to be scored as correct, and this may have the impact of overestimating the performance of the models. Second, the vignettes and diagnoses used for our experiment may be in the training set of these models; our test results are reassuring against memorization and the vignettes are explicitly not licensed for this use, but it is still possible that training exposure may have biased our evaluation towards better performance, constituting a contamination risk. Even in this case, however, we believe that the relative findings between methods in our study would be valuable. Third, since the vignettes were written for teaching, the prevalence of diagnoses is higher in the dataset than in the general population and more comprehensive information is provided in the vignettes than is generally available in the clinical setting, limiting the ecological validity of our sample. This may serve to inflate the performance metrics that we found, and our results are not intended to suggest clinical utility at this stage. Additionally, the amplified illness prevalence in a vignette-based educational dataset enables methodological exploration, but findings may not generalize to low-prevalence real-world populations (such as self-diagnosing internet users). Fourth, not all diagnostic categories are equally or proportionally covered by our dataset, and our study was not powered to detect inter-category differences. 
Additionally, because we specifically collapsed neurocognitive disorders into two diagnoses (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), performance on these cases may be artificially inflated by masking potential challenges in differentiating between specific neurocognitive pathologies. Fifth, we have focused in our study on evaluating whether there is an improvement in the outcomes of the pipeline for analysis, but we did not directly evaluate the accuracy of the reasoning output generated by the LRMs, or the individual-level decisions made after SV prompts. Further, the actual reasoning output generated may not be causally linked to the final outcome [<xref ref-type="bibr" rid="ref14">14</xref>].</p></sec><sec id="s4-3"><title>Future Directions</title><p>Future efforts could make use of larger datasets, potentially including enhanced coverage of each diagnostic category and making use of either novel vignettes or real-world clinical data, considering full differential diagnosis, and comparison to human performance. This would enhance the power of future studies and minimize the risk of training bias. Additionally, future studies could examine the impact of model selection and prompt optimization to better elucidate how best to configure language models for psychiatric use, including the potential blending of SR and SV-like approaches, such as the use of pretemplated prompts that call for generating additional reasoning prompts. The addition of probabilistic confidence estimates would also be helpful, as real-world diagnostic processes require the generation of <italic>differential</italic> diagnoses; though estimates obtained by prompting are not reliable [<xref ref-type="bibr" rid="ref3">3</xref>], recent efforts have explored the use of confidence metrics like LLM perplexity for psychiatric reasoning [<xref ref-type="bibr" rid="ref24">24</xref>]. 
Finally, further studies could directly assess the accuracy and efficacy of the actual reasoning tokens generated by LRMs during inference, and whether there may be methods to improve the generated tokens.</p></sec><sec id="s4-4"><title>Conclusion</title><p>In this study, we found that both simulated reasoning and self-verification improved the positive predictive value of LLM-based psychiatric diagnosis from case vignettes, without significantly affecting sensitivity. The impact of these two approaches suggests that manually crafted verification prompts can continue to provide value even when automated reasoning capabilities are available. Future work should validate these findings using real-world clinical data in order to inform clinical system design.</p></sec></sec></body><back><ack><p>The authors acknowledge the mentorship and scientific vision provided by Dr. Atul J Butte, MD, PhD, director of the UCSF Bakar Computational Health Sciences Institute, who passed away in June 2025. We thank the UCSF AI Tiger Team, UCSF Academic Research Services, UCSF Research Information Technology, and the UCSF Chancellor&#x2019;s Task Force for Generative AI for their support in developing LLM resources used for this project. Generative AI was used as described in the methods section for the execution of this research. It was not used in the production of the initial draft manuscript, but was used to assist in the development of revisions, including by reviewing and suggesting edits to address reviewer comments and generating candidate language for inclusion in the manuscript. All content was manually reviewed by a human author and the authors take responsibility for all content in this manuscript. The authors appreciated the opportunity to submit an earlier component of this work to the 2026 ACNP Annual Meeting. 
The contents of this manuscript are solely the responsibility of the authors and do not necessarily represent the official views of the NIH, APA, UCSF, or any other organization.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the National Institute of Mental Health of the National Institutes of Health [grant number R25 MH060482] and by the National Center for Advancing Translational Sciences, National Institutes of Health, through UCSF-CTSI Grant Number UL1 TR001872.</p></sec><sec><title>Data Availability</title><p>The dataset used for this study was extracted from the DSM-5-TR Clinical Cases textbook under license from the American Psychiatric Association (as follows), and cannot be reproduced by the authors of this study: This research was made possible through the use of content belonging to the American Psychiatric Association; express permission was obtained from the American Psychiatric Association for the use of such content (DSM-5-TR Clinical Cases. Copyright &#x00A9; 2023. American Psychiatric Association. All Rights Reserved, including rights for text and data mining (TDM), Artificial Intelligence (AI) training, and similar technologies).</p></sec></notes><fn-group><fn fn-type="con"><p>KVS: conceptualization, methodology, software, validation, formal analysis, investigation, resources, data curation, writing &#x2013; original draft, writing &#x2013; review &#x0026; editing, visualization, supervision, project administration. KEH: methodology, software, data curation, writing &#x2013; review &#x0026; editing. AJMH: methodology, writing &#x2013; review &#x0026; editing, supervision. DFB: methodology, writing &#x2013; review &#x0026; editing, supervision. ALG: methodology, writing &#x2013; review &#x0026; editing, supervision. 
ADK: methodology, resources, writing &#x2013; review &#x0026; editing, supervision, funding acquisition.</p></fn><fn fn-type="conflict"><p>KVS holds publicly traded stock or options in Abbott Laboratories, Synergy Pharma, Pfizer, Procter &#x0026; Gamble, Colgate, 3M, and Solventum; private equity in SimX, OpenEvidence, and Orchard Neuro; and has received consulting fees, salary, travel reimbursement, or other support from SimX, the American Psychiatric Association, the California Medical Association, the American College of Neuropsychopharmacology, the US Department of Defense, Stanford University, Keio University, the Wellcome Learning Network, Gerson Lehrman Group, Madison Industries, Amazon, OpenEvidence, and OpenAI. ADK has received research grants from Alkermes, Janssen Pharmaceuticals, Axsome Therapeutics, Attune, Eisai, Harmony Biosciences, Neumora, Neurocrine Biosciences, the Ray and Dagmar Dolby Family Fund, the Weill Institute for Neurosciences, and the National Institutes of Health; consulting fees from AbbVie, Axsome Therapeutics, Big Health, Eisai, Evecxia, Harmony Biosciences, Idorsia, Janssen Pharmaceuticals, Jazz Pharmaceuticals, Neumora, Neurawell, Neurocrine Biosciences, Otsuka, Sage, and Takeda; and stock options in Big Health and Neurawell. The disclosed organizations had no role in any component of the conduct of the research. 
The authors disclose no other interests related to the content of this manuscript.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">DSM</term><def><p>Diagnostic and Statistical Manual</p></def></def-item><def-item><term id="abb4">DSM-5-TR</term><def><p>Diagnostic and Statistical Manual, Version 5, Text Revision</p></def></def-item><def-item><term id="abb5">FN</term><def><p>false negative</p></def></def-item><def-item><term id="abb6">FP</term><def><p>false positive</p></def></def-item><def-item><term id="abb7">GLMM</term><def><p>generalized linear mixed model</p></def></def-item><def-item><term id="abb8">ICC</term><def><p>intraclass correlation</p></def></def-item><def-item><term id="abb9">JSON</term><def><p>JavaScript Object Notation</p></def></def-item><def-item><term id="abb10">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb11">LRM</term><def><p>large reasoning model</p></def></def-item><def-item><term id="abb12">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb13">SR</term><def><p>simulated reasoning</p></def></def-item><def-item><term id="abb14">SV</term><def><p>self-verification</p></def></def-item><def-item><term id="abb15">TP</term><def><p>true positive</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Villalobos</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sevilla</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Besiroglu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Heim</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hobbhahn</surname><given-names>M</given-names> </name></person-group><article-title>Position: will we run out of data? limits of LLM scaling based on human-generated data</article-title><conf-name>Proceedings of the 41st International Conference on Machine Learning Vienna</conf-name><conf-date>Jul 21-27, 2024</conf-date><conf-loc>Vienna, Austria</conf-loc><fpage>49523</fpage><lpage>49544</lpage><pub-id pub-id-type="doi">10.5555/3692070.3694094</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heinz</surname><given-names>MV</given-names> </name><name name-style="western"><surname>Mackin</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Trudeau</surname><given-names>BM</given-names> </name><etal/></person-group><article-title>Randomized trial of a generative AI chatbot for mental health treatment</article-title><source>NEJM AI</source><year>2025</year><month>03</month><day>27</day><volume>2</volume><issue>4</issue><fpage>AIoa2400802</fpage><pub-id pub-id-type="doi">10.1056/AIoa2400802</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hanss</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sarma</surname><given-names>KV</given-names> </name><name name-style="western"><surname>Glowinski</surname><given-names>AL</given-names> </name><etal/></person-group><article-title>Assessing the accuracy and reliability of large language models in psychiatry using standardized multiple-choice questions: cross-sectional study</article-title><source>J Med Internet 
Res</source><year>2025</year><month>05</month><day>20</day><volume>27</volume><issue>1</issue><fpage>e69910</fpage><pub-id pub-id-type="doi">10.2196/69910</pub-id><pub-id pub-id-type="medline">40392576</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sarma</surname><given-names>KV</given-names> </name><name name-style="western"><surname>Hanss</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Halls</surname><given-names>AJM</given-names> </name><etal/></person-group><article-title>Integrating expert knowledge into large language models improves performance for psychiatric reasoning and diagnosis</article-title><source>Psychiatry Res</source><year>2026</year><month>01</month><volume>355</volume><fpage>116844</fpage><pub-id pub-id-type="doi">10.1016/j.psychres.2025.116844</pub-id><pub-id pub-id-type="medline">41270691</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rasool</surname><given-names>A</given-names> </name><name name-style="western"><surname>Aslam</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hussain</surname><given-names>N</given-names> </name><name name-style="western"><surname>Imtiaz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riaz</surname><given-names>W</given-names> </name></person-group><article-title>nBERT: harnessing NLP for emotion recognition in psychotherapy to transform mental health care</article-title><source>Information</source><year>2025</year><month>04</month><volume>16</volume><issue>4</issue><fpage>301</fpage><pub-id pub-id-type="doi">10.3390/info16040301</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>King</surname><given-names>DR</given-names> </name><name name-style="western"><surname>Nanda</surname><given-names>G</given-names> </name><name name-style="western"><surname>Stoddard</surname><given-names>J</given-names> </name><etal/></person-group><article-title>An introduction to generative artificial intelligence in mental health care: considerations and guidance</article-title><source>Curr Psychiatry Rep</source><year>2023</year><month>12</month><volume>25</volume><issue>12</issue><fpage>839</fpage><lpage>846</lpage><pub-id pub-id-type="doi">10.1007/s11920-023-01477-x</pub-id><pub-id pub-id-type="medline">38032442</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Obradovich</surname><given-names>N</given-names> </name><name name-style="western"><surname>Khalsa</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Opportunities and Risks of Large Language Models in Psychiatry</article-title><source>NPP Digit Psychiatry Neurosci</source><year>2024</year><volume>2</volume><issue>1</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1038/s44277-024-00010-z</pub-id><pub-id pub-id-type="medline">39554888</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Volkmer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Meyer-Lindenberg</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schwarz</surname><given-names>E</given-names> </name></person-group><article-title>Large language models in psychiatry: Opportunities and challenges</article-title><source>Psychiatry 
Res</source><year>2024</year><month>09</month><volume>339</volume><fpage>116026</fpage><pub-id pub-id-type="doi">10.1016/j.psychres.2024.116026</pub-id><pub-id pub-id-type="medline">38909412</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Orr&#x00F9;</surname><given-names>G</given-names> </name><name name-style="western"><surname>Melis</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sartori</surname><given-names>G</given-names> </name></person-group><article-title>Large language models and psychiatry</article-title><source>Int J Law Psychiatry</source><year>2025</year><volume>101</volume><fpage>102086</fpage><pub-id pub-id-type="doi">10.1016/j.ijlp.2025.102086</pub-id><pub-id pub-id-type="medline">40020592</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clarke</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Narrow</surname><given-names>WE</given-names> </name><name name-style="western"><surname>Regier</surname><given-names>DA</given-names> </name><etal/></person-group><article-title>DSM-5 Field Trials in the United States and Canada, Part I: study design, sampling strategy, implementation, and analytic approaches</article-title><source>Am J Psychiatry</source><year>2013</year><month>01</month><volume>170</volume><issue>1</issue><fpage>43</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.1176/appi.ajp.2012.12070998</pub-id><pub-id pub-id-type="medline">24200851</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Regier</surname><given-names>DA</given-names> </name><name 
name-style="western"><surname>Narrow</surname><given-names>WE</given-names> </name><name name-style="western"><surname>Clarke</surname><given-names>DE</given-names> </name><etal/></person-group><article-title>DSM-5 field trials in the United States and Canada, Part II: test-retest reliability of selected categorical diagnoses</article-title><source>Am J Psychiatry</source><year>2013</year><month>01</month><volume>170</volume><issue>1</issue><fpage>59</fpage><lpage>70</lpage><pub-id pub-id-type="doi">10.1176/appi.ajp.2012.12070999</pub-id><pub-id pub-id-type="medline">23111466</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rasool</surname><given-names>A</given-names> </name><name name-style="western"><surname>Surabhi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Aiding large language models using clinical scoresheets for neurobehavioral diagnostic classification from text: algorithm development and validation</article-title><source>JMIR AI</source><year>2025</year><month>10</month><day>21</day><volume>4</volume><issue>1</issue><fpage>e75030</fpage><pub-id pub-id-type="doi">10.2196/75030</pub-id><pub-id pub-id-type="medline">41118647</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Is chain-of-thought reasoning of LLMs a mirage? 
A data distribution lens</article-title><source>arXiv</source><comment>Preprint posted online on Aug 2, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2508.01191</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Benton</surname><given-names>J</given-names> </name><name name-style="western"><surname>Radhakrishnan</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Reasoning models don&#x2019;t always say what they think</article-title><source>arXiv</source><comment>Preprint posted online on May 8, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.05410</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Alizadeh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Horton</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>S</given-names> </name><name name-style="western"><surname>Farajtabar</surname><given-names>M</given-names> </name></person-group><source>The illusion of thinking: understanding the strengths and limitations of reasoning models via the lens of problem complexity</source><year>2025</year><access-date>2025-06-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ml-site.cdn-apple.com/papers/the-illusion-of-thinking.pdf">https://ml-site.cdn-apple.com/papers/the-illusion-of-thinking.pdf</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>OpenAI</collab></person-group><source>Building more helpful ChatGPT experiences for 
everyone</source><year>2025</year><access-date>2025-09-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/building-more-helpful-chatgpt-experiences-for-everyone/">https://openai.com/index/building-more-helpful-chatgpt-experiences-for-everyone/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Esmail</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>RS</given-names> </name><etal/></person-group><article-title>Large language model performance and clinical reasoning tasks</article-title><source>JAMA Netw Open</source><year>2026</year><month>04</month><day>1</day><volume>9</volume><issue>4</issue><fpage>e264003</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2026.4003</pub-id><pub-id pub-id-type="medline">41973425</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McCoy</surname><given-names>LG</given-names> </name><name name-style="western"><surname>Swamy</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sagar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Assessment of large language models in clinical reasoning: a novel benchmarking study</article-title><source>NEJM AI</source><year>2025</year><month>09</month><day>25</day><volume>2</volume><issue>10</issue><fpage>AIdbp2500120</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2500120</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Arora</surname><given-names>RK</given-names> 
</name><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hicks</surname><given-names>RS</given-names> </name><etal/></person-group><article-title>HealthBench: evaluating large language models towards improved human health</article-title><source>arXiv</source><comment>Preprint posted online on May 13, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.08775</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Quantifying the reasoning abilities of LLMs on clinical cases</article-title><source>Nat Commun</source><year>2025</year><month>11</month><day>6</day><volume>16</volume><issue>1</issue><fpage>9799</fpage><pub-id pub-id-type="doi">10.1038/s41467-025-64769-1</pub-id><pub-id pub-id-type="medline">41198657</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Long</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Large language models for psychiatric diagnosis based on multicenter real-world clinical records: comparative study</article-title><source>JMIR Med Inform</source><year>2026</year><month>01</month><day>13</day><volume>14</volume><fpage>e77699</fpage><pub-id pub-id-type="doi">10.2196/77699</pub-id><pub-id pub-id-type="medline">41408781</pub-id></nlm-citation></ref><ref 
id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Och</surname><given-names>FJ</given-names> </name></person-group><article-title>Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics</article-title><year>2004</year><conf-name>Proceedings of the 42nd Annual Meeting on Association for Computational Linguistics</conf-name><conf-date>Jul 21-26, 2004</conf-date><conf-loc>Barcelona, Spain</conf-loc><fpage>605</fpage><lpage>es</lpage><pub-id pub-id-type="doi">10.3115/1218955.1219032</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Papineni</surname><given-names>K</given-names> </name><name name-style="western"><surname>Roukos</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ward</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>WJ</given-names> </name></person-group><article-title>BLEU: a method for automatic evaluation of machine translation</article-title><conf-name>Proceedings of the 40th Annual Meeting on Association for Computational Linguistics USA</conf-name><conf-date>Jul 7-12, 2002</conf-date><pub-id pub-id-type="doi">10.3115/1073083.1073135</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pakhomov</surname><given-names>S</given-names> </name><name name-style="western"><surname>Heagerty</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Perplexity and proximity: Large language model perplexity complements semantic distance metrics 
for the detection of incoherent speech</article-title><source>J Biomed Inform</source><year>2025</year><month>10</month><volume>170</volume><fpage>104899</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2025.104899</pub-id><pub-id pub-id-type="medline">40849054</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Methods for diagnosis simplification and matching.</p><media xlink:href="ai_v5i1e83927_app1.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Detailed scoring charts.</p><p>This spreadsheet contains one tab with detailed scoring information for each combination of model and inference approach. The columns are as follows:</p><p>case_number: Vignette number from the DSM-5-TR Clinical Cases handbook.</p><p>answers_dx: JSON string array containing the author-designated diagnoses.</p><p>model_dx: JSON string array containing the model-predicted diagnoses.</p><p>dx_TP: Count of true positive diagnoses.</p><p>dx_TP_list: JSON string array containing the true positive diagnoses.</p><p>dx_FP: Count of false positive diagnoses.</p><p>dx_FP_list: JSON string array containing the false positive diagnoses.</p><p>dx_FN: Count of false negative diagnoses.</p><p>dx_FN_list: JSON string array containing the false negative diagnoses.</p><p>dx_recall: The computed recall metric for this vignette.</p><p>dx_precision: The computed precision metric for this vignette.</p><media xlink:href="ai_v5i1e83927_app2.xlsx" xlink:title="XLSX File, 60 KB"/></supplementary-material></app-group></back></article>