<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e69149</article-id><article-id pub-id-type="doi">10.2196/69149</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Digital Phenotyping for Detecting Depression Severity in a Large Payor-Provider System: Retrospective Study of Speech and Language Model Performance</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Karlin</surname><given-names>Bradley</given-names></name><degrees>MSCP, MBA, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Henry</surname><given-names>Doug</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Anderson</surname><given-names>Ryan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Cieri</surname><given-names>Salvatore</given-names></name><degrees>LCSW</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Aratow</surname><given-names>Michael</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shriberg</surname><given-names>Elizabeth</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hoy</surname><given-names>Michelle</given-names></name><degrees>LPC</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Highmark Health</institution><addr-line>120 Fifth Avenue, Fifth Avenue Place</addr-line><addr-line>Pittsburgh</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Johns Hopkins Bloomberg School of Public Health</institution><addr-line>Baltimore</addr-line><addr-line>MD</addr-line><country>United States</country></aff><aff id="aff3"><institution>Ellipsis Health</institution><addr-line>548 Market Street, PMB 49051</addr-line><addr-line>San Francisco</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Huo</surname><given-names>Yuankai</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Meyer</surname><given-names>Denny</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Adegoke</surname><given-names>Kola</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Michael Aratow, MD, Ellipsis Health, 548 Market Street, PMB 49051, San Francisco, CA, 94104-5401, United States, 1 
800-410-5383; <email>mike@ellipsishealth.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>19</day><month>6</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e69149</elocation-id><history><date date-type="received"><day>25</day><month>11</month><year>2024</year></date><date date-type="rev-recd"><day>30</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>31</day><month>03</month><year>2025</year></date></history><copyright-statement>&#x00A9; Bradley Karlin, Doug Henry, Ryan Anderson, Salvatore Cieri, Michael Aratow, Elizabeth Shriberg, Michelle Hoy. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 19.6.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e69149"/><abstract><sec><title>Background</title><p>There is considerable need to improve and increase the detection and measurement of depression. 
The use of speech as a digital biomarker of depression represents a considerable opportunity for transforming and accelerating depression identification and treatment; however, research to date has primarily consisted of small-sample feasibility or pilot studies incorporating highly controlled applications and settings. There has been limited examination of the technology in real-world use contexts.</p></sec><sec><title>Objective</title><p>This study evaluated the performance of a machine learning (ML) model examining both semantic and acoustic properties of speech in predicting depression across more than 2000 real-world interactions between health plan members and case managers.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 2086 recordings of case management calls with verbally administered Patient Health Questionnaire&#x2014;9 questions (PHQ-9) surveys were analyzed using the ML model after the portions of the recordings with the PHQ-9 survey were manually redacted. The recordings were divided into a Development Set (Dev Set) (n=1336) and a Blind Set (n=671), and Patient Health Questionnaire&#x2014;8 questions (PHQ-8) scores were provided for the Dev Set for ML model refinement while PHQ-8 scores from the Blind Set were withheld until after ML model depression severity output was reported.</p></sec><sec sec-type="results"><title>Results</title><p>The Dev Set and the Blind Set were well matched for age (Dev Set: mean 53.7, SD 16.3 years; Blind Set: mean 51.7, SD 16.9 years), gender (Dev Set: 910/1336, 68.1% of female participants; Blind Set: 462/671, 68.9% of female participants), and depression severity (Dev Set: mean 10.5, SD 6.1 of PHQ-8 scores; Blind Set: mean 10.9, SD 6.0 of PHQ-8 scores). 
The concordance correlation coefficient was &#x03C1;<sub>c</sub>=0.57 for the test of the ML model on the Dev Set and &#x03C1;<sub>c</sub>=0.54 on the Blind Set, while the mean absolute error was 3.91 for the Dev Set and 4.06 for the Blind Set, demonstrating strong model performance. This performance was maintained when dividing each set into subgroups of age brackets (&#x2264;39, 40&#x2010;64, and &#x2265;65 years), biological sex, and the 4 categories of Social Vulnerability Index (an index based on 16 social factors), with concordance correlation coefficients ranging from &#x03C1;<sub>c</sub>=0.44 to &#x03C1;<sub>c</sub>=0.61. Performance at PHQ-8 threshold score cutoffs of 5, 10, 15, and 20, representing the depression severity categories of none, mild, moderate, moderately severe, and severe (&#x2265;20), respectively, expressed as area under the receiver operating characteristic curve values, varied between 0.79 and 0.83 in both the Dev and Blind Sets.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Overall, the findings suggest that speech may have significant potential for detection and measurement of depression severity across a variety of age, gender, and socioeconomic categories that may enhance treatment, improve clinical decision-making, and enable truly personalized treatment recommendations.</p></sec></abstract><kwd-group><kwd>depression</kwd><kwd>vocal biomarkers</kwd><kwd>artificial intelligence</kwd><kwd>behavioral health</kwd><kwd>machine learning</kwd><kwd>health care case management</kwd><kwd>mobile phone</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>The prevalence and impact of behavioral health (BH) problems are at an all-time high. As many as 1 in 3 individuals throughout the United States have a BH condition [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Rates of subclinical needs are even higher, fueled in part by the psychological and social effects of the pandemic; as many as 1 in 2 individuals reports 1 or more symptoms of depression or anxiety [<xref ref-type="bibr" rid="ref2">2</xref>]. In fact, the prevalence of depression symptoms increased more than 3-fold during COVID-19 [<xref ref-type="bibr" rid="ref3">3</xref>]. At the same time, only 40% of those with BH conditions, and even fewer with subclinical needs, receive care of any kind, due to challenges with and delays in detection, perceived need, stigma, a paucity of providers, and other factors, and less than 15% of individuals with serious BH conditions receive minimally adequate treatment [<xref ref-type="bibr" rid="ref4">4</xref>]. For those who do receive care, there is an average lag time of 11 years from the time of symptom onset to first treatment, during which time symptoms often worsen and other comorbidities may develop [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>The current state of BH care and high levels of unmet need reflect a reactive and downstream approach to the identification and treatment of BH problems that has characterized the industry for decades. Effectively and efficiently meeting BH needs requires a more proactive, upstream, and personalized approach that meets individual needs earlier in their clinical trajectories with right-sized and person-fit interventions [<xref ref-type="bibr" rid="ref5">5</xref>]. Emerging innovations in data science and technology, particularly developments in advanced data analytics and the availability of high-quality, patient-driven digital interventions, present unprecedented opportunities to transform and innovate the field of BH care and reduce enduring, high rates of unmet need.</p><p>One particularly promising innovation for advancing detection and delivery of proactive, personalized, and data-informed treatment is digital phenotyping. 
Digital phenotyping involves the detection of phenotypes, or behavioral signals, that may indicate or predict the presence of a BH problem. Translated by machine learning (ML) models and collected through passive data collection via smartphones, wearables, or other communication devices, these data signals may serve as clinically, and potentially preclinically, useful markers of BH problems. The potential relevance and use of digital phenotyping, which has been identified as the &#x201C;next frontier&#x201D; for personalized and proactive care within the field of oncology [<xref ref-type="bibr" rid="ref6">6</xref>], have attracted particular interest and attention in the field of BH care and personalized psychiatry, with recent calls for accelerated applications to clinical practice [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. In their review of research in this area, Huckvale et al [<xref ref-type="bibr" rid="ref7">7</xref>] declared, &#x201C;Many...studies appear to anticipate that digital phenotyping should play a role in routine clinical practice, for example by enhancing aspects of clinical diagnosis and treatment through earlier detection of condition onset, relapse or treatment response.&#x201D;</p><p>Most of the research examining digital phenotyping for the detection of BH problems has focused on detection of depression [<xref ref-type="bibr" rid="ref10">10</xref>]. The opportunity to engage passive and objective ML technology for better detecting depression presents particular opportunities in light of the fact that depression is undetected in approximately 50% of individuals with the condition in high-income countries, and in 80%&#x2010;90% of individuals with depression in low- and middle-income countries [<xref ref-type="bibr" rid="ref11">11</xref>]. 
In addition to opportunities that automated detection of depression provides for increasing low screening rates in most clinical and community settings, ML presents significant promise for overcoming underreporting and underdetection due to stigma, lack of evaluative service access, misattribution of symptoms to physical illness or age-related factors, or underrecognition of symptoms. In addition, the use of ML for detecting depression offers significant potential for increasing <italic>earlier</italic> identification and intervention, enhancing clinical efficiency through more accurate triage and treatment performance monitoring, improving fidelity through the use of objective measures, providing decision support, and personalizing BH care. As Galatzer-Levy and Onnela [<xref ref-type="bibr" rid="ref12">12</xref>] recently declared, &#x201C;Ultimately, the development of clinically meaningful digital measurements and their implementation in real-world contexts will permit optimized and personalized treatments targeted to the individual&#x2019;s emergent presentation and needs.&#x201D;</p></sec><sec id="s1-2"><title>Prior Work</title><p>Among the most promising applications of digital phenotyping is the use of speech as a vocal biomarker of depression and other BH conditions [<xref ref-type="bibr" rid="ref13">13</xref>]. The application of speech analysis in this context includes models for moment-by-moment analysis of the semantic patterns (&#x201C;what&#x201D; is said) or the acoustic properties (eg, tone, pitch, loudness, duration, articulation, transitions, and prosody) of speech, or the application of both. Increasing research has demonstrated the promise of speech analysis, including generally increasing accuracy in overall detection of depression and other conditions [<xref ref-type="bibr" rid="ref7">7</xref>]. 
Despite this promise, research to date has been primarily conducted in controlled contexts and uses, and there has been very limited examination or application of this technology in real-world settings [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. As Koutsouleris et al [<xref ref-type="bibr" rid="ref14">14</xref>] recently noted, &#x201C;While these innovations promise to revolutionize health care, little progress has been made toward real precision mental health applications. Implementation of these applications is often an afterthought.&#x201D;</p><p>Research on the use of speech analysis for measuring BH symptoms has consisted primarily of small-sample feasibility or pilot studies with nonrepresentative samples [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. For example, in a scoping review of speech analysis for measuring mood disorders conducted by Flanagan et al [<xref ref-type="bibr" rid="ref15">15</xref>], approximately 80% of studies were pilot or feasibility studies with sample sizes ranging from 1 to 73 participants. Similarly, in their review, Chia and Zhang [<xref ref-type="bibr" rid="ref10">10</xref>] reported a mean sample size of 60. Moreover, many studies have consisted of analysis of &#x201C;toy&#x201D; datasets or controlled proof-of-concept studies involving highly controlled designs that, while promising for establishing the potential of a technological innovation, have yielded findings that are not necessarily generalizable or have use or effectiveness for real-world use [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. 
These designs include use of analog speech tasks (eg, responding to a singular question, reading formulated passages, and answering questions about everyday life, often referred to as &#x201C;closed-form&#x201D; tests) that are often not comparable with real-world clinical settings or real-life contexts.</p><p>In addition, many studies examining speech analysis in the BH context have had important methodological limitations, including frequent reporting of selected metrics, such as reporting of sensitivity without specificity, leading to the recent call for research in this area to report multiple metrics, including robust metrics, such as the concordance correlation coefficient (CCC), that are not as biased to specific context, use case, and data label distributions [<xref ref-type="bibr" rid="ref13">13</xref>]. Many studies have also relied on binary classifications (above or below cutoff score for clinical significance) for screening tools, which limit opportunities for promoting precision and personalization in BH care. Furthermore, research on speech analysis in detecting and measuring BH symptoms has almost exclusively relied on the use of single methods of analysis (predominantly acoustic analysis). Opportunities for leveraging and combining analysis of acoustic and semantic properties of speech may yield greater accuracy and precision in detecting and predicting BH conditions.</p></sec><sec id="s1-3"><title>Goal of This Study</title><p>As mentioned previously, the application of digital phenotyping within BH care has approached a defining moment and key turning point for the field. 
In their review of the current state of digital phenotyping within the field of BH, Huckvale et al [<xref ref-type="bibr" rid="ref7">7</xref>] have urged for &#x201C;practical and coordinated action...to help accelerate both research and the ultimate development of real-world health applications for digital phenotyping.&#x201D; In an effort to help advance real-world application of digital phenotyping for promoting earlier and automated detection and measurement of depression, this study evaluated the performance of an ML model of the semantic and acoustic properties of spoken language in predicting depression in a naturalistic context by analyzing more than 2000 interactions between health plan members and case managers. Additionally, the study sought to test model performance beyond &#x201C;presence or absence&#x201D; dichotomous predictions, examining classificatory accuracy at multiple levels of depression from none or minimal to severe. Furthermore, model performance was tested across age, sex, and sociodemographic factors and in BH and non-BH case management contexts. This project, which is unique in its breadth and scope, aims to assess the accuracy of speech analysis for detecting and measuring depression severity in routine clinical settings. We hypothesize that the ML models used in this study will demonstrate robust predictive accuracy across variations in age, gender, care management context, and Social Vulnerability Index (SVI).</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Experimental Design</title><p>The current quality improvement project evaluated the performance of the combined semantic-acoustic ML speech analysis model in predicting depression severity from existing recordings of case management calls, with BH case managers who are licensed independent mental health providers. 
Specifically, the performance of the ML model in care management conversations between insured members and BH case managers was evaluated by retrospectively comparing the actual scores from the Patient Health Questionnaire&#x2014;9 questions (PHQ-9) administered by the case managers with depression severity scores predicted by the ML model. The predicted Patient Health Questionnaire&#x2014;8 questions (PHQ-8) scores were derived from the qualities (acoustic biomarkers and semantic content) of vocal productions of the same members conversing with care managers while engaged in discussion other than the PHQ-9 administration. A secondary aim was to examine model performance in non-BH contexts where the PHQ-9 is not routinely administered using a subsample of calls with non-BH case managers. For both BH and non-BH calls, model predictions were compared with PHQ-8 scores from an associated metadata file.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>On each of the calls analyzed, the PHQ-9 was verbally administered. Members consented to the recording of the call for quality and training purposes. This study was designated as a quality improvement project by the institutional review board of the Allegheny Health Network and therefore exempt from ongoing institutional review board oversight. The project was also reviewed and approved by the Highmark Health Enterprise Data Governance Committee to ensure that it comported with internal data protection standards and applicable privacy, legal, and regulatory requirements, including deidentification of data. There was no compensation provided as recordings were made in the normal course of business.</p></sec><sec id="s2-3"><title>Measures</title><sec id="s2-3-1"><title>Depression Severity</title><p>The PHQ-9 is a widely used self-report measure of depression symptom severity. Frequency of depression symptoms is endorsed by patients using a 4-point scale, ranging from 0 (&#x201C;Not at all&#x201D;) to 3 (&#x201C;Nearly every day&#x201D;). 
PHQ-9 scores range from 0 to 27. Higher scores reflect greater depression severity. Scores of 0-4 are classified as &#x201C;none to minimal,&#x201D; 5-9 are classified as &#x201C;mild,&#x201D; 10-14 are classified as &#x201C;moderate,&#x201D; 15-19 are classified as &#x201C;moderately severe,&#x201D; and 20-27 are classified as &#x201C;severe.&#x201D; The PHQ-9 has been shown to be an internally consistent, valid, and reliable measure of depression severity [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. For this study, the last item of the PHQ-9, which assesses for suicidal or self-injurious thoughts, was omitted given the use of archival data where further probing of responses was not feasible. Its inclusion requires different clinical considerations and handling in research settings. The adapted scale with item 9 removed is referred to as the PHQ-8 and has been shown to have strong psychometric characteristics, including the ability to accurately predict depression [<xref ref-type="bibr" rid="ref20">20</xref>].</p></sec><sec id="s2-3-2"><title>ML Speech Analysis Model</title><p>The semantic-acoustic model evaluated in this study has demonstrated robust results for accurate prediction of depression symptom severity and acceptable rates of error [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. The proprietary ML model includes both acoustic and semantic models. The acoustic model takes as input the raw speech signal (rather than precomputed features such as pitch or energy). The production acoustic workflow is built on a pretrained open-source wav2vec2 architecture [<xref ref-type="bibr" rid="ref24">24</xref>] and is trained on proprietary audio data. The system consists of 4 segment models, each trained with specific configurations, and 3 segment fusion models that integrate outputs from the segment networks. 
Predictions from the segment fusion models are weighted to generate the final acoustic score.</p><p>The semantic model (referred to also as a natural language processing model) takes as input the output of a commercial automatic speech recognition (ASR) system. The model is based on the Longformer architecture [<xref ref-type="bibr" rid="ref25">25</xref>], designed to efficiently handle long conversational contexts using advanced mechanisms such as dilated sliding window attention. Model training involves a proprietary fine-tuning approach using depression-specific data, using high-quality proprietary transcripts paired with PHQ scores. Further refinement is conducted using conversational samples, also labeled with PHQ scores. Labeled training data come from a large corpus of proprietary spoken language datasets labeled with PHQ-8 values. Both models take advantage of publicly available data for model pretraining, including text corpora for the natural language processing model.</p><p>To generate the final depression severity prediction, the outputs of the acoustic and language model are combined using a linear weighting; the weight is optimized using the CCC metric on the Development set. <xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the overall ML analysis, from data preprocessing to prediction generation.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Deep learning architecture and processing pipeline. PHI: Protected health information; PII: personally identifiable information.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e69149_fig01.png"/></fig></sec></sec><sec id="s2-4"><title>Identification of Case Management Calls and Metadata</title><p>A total of 2626 recordings of case management calls were included. They took place between January 2019 and January 2023. 
Of these calls, 2083 had full item-level data for the first 8 items of the PHQ-9, which were collected verbally during the course of calls. Calls corresponded to unique members from 44 different US states and were completed by 46 case managers. The majority of case managers completed multiple calls, and approximately one-third completed 20 or more calls. Each call recording had an associated metadata file containing member age, biological sex, zip code, whether the call was conducted by a BH case manager or non-BH (eg, medical and surgical) case manager, and PHQ-9 item-level data. Exclusion criteria for recordings included member age less than 18 years, voicemail messages, presence of any speakers beyond the case manager and the member, recordings in which the member was not present, and diarization failures (failures to correctly segment audio into single-speaker time regions). These exclusion criteria constituted 76 of the 2083 calls, leaving a total of 2007 calls for the analysis.</p></sec><sec id="s2-5"><title>Partitioning the Data</title><p>The evaluation was conducted in 2 phases. To establish datasets for both phases of the project, the 2007 recordings and their metadata were partitioned by randomly assigning them to a development dataset (Dev Set) consisting of approximately two-thirds of the total available calls (n=1336) and a test dataset (Blind Set) consisting of approximately one-third of the total available calls (n=671). There was no speaker overlap across these datasets. The partitions were constructed to ensure reasonably equal representation of the metadata, including the distribution of PHQ-9 severity (none, mild, moderate, moderately severe, and severe). The Dev Set and Blind Set were securely delivered via secure file transfer protocol to Ellipsis Health, which performed all further data processing and analyses of the calls and metadata. 
The Blind Set was held back until phase 1 was completed.</p><p>The 2 datasets (Dev and Blind) were well matched; however, there were data curation errors such as inclusion of voicemails, conversations in a different language (mostly involving a language interpreter), and minors (younger than 18 years). Subsequent to the delivery of each dataset, 76 recordings, 44 from the Dev Set and 32 from the Blind Set, were found to meet exclusionary criteria through review of metadata (ie, age) and through diarization tool flags indicating a single speaker or more than 2 speakers. These 76 recordings were removed from the analyses. However, because the audio tracks were not reviewed by the annotators, other recordings meeting exclusionary criteria were included in the Dev and Blind Sets. An analysis was conducted, and it is estimated that inclusion errors constituted 3% of the total recordings analyzed.</p></sec><sec id="s2-6"><title>Data Preprocessing</title><p>Upon receiving call recordings, Ellipsis Health performed diarization, ASR, and redaction of personally identifiable information using Amazon Web Services Amazon Transcribe [<xref ref-type="bibr" rid="ref26">26</xref>]. Redaction of protected health information was performed using Amazon Comprehend Medical [<xref ref-type="bibr" rid="ref27">27</xref>]. Speaker role detection and time stamp generation on turns in conversation (ie, transition from member to case manager and vice versa) were performed using proprietary algorithms from Ellipsis Health. The redacted output of the ASR process, which included transcripts of both the member and the case manager with PHQ-9 content removed, was used by the semantic model. Meanwhile, the diarized, redacted audio containing only the member&#x2019;s speech, with PHQ-9 content masked, was used by the acoustic model. 
The outputs of the semantic and acoustic models were used (after weighting) to arrive at the fused output, using the Dev dataset.</p></sec><sec id="s2-7"><title>Manual Annotation</title><p>To remove the verbally administered PHQ-9 from the calls, a manual annotation process was performed to identify the regions in the transcripts where the PHQ-9 was administered. Ellipsis Health used a team of professionals separate from the team conducting tests of the ML model (ie, ML team) to perform manual annotation, which consisted of annotators being presented with the case manager portion of the transcript from each call and having them identify the regions that contained the PHQ questions. These annotated regions of audio samples were masked in white noise for the acoustic model analysis, and the corresponding text was removed from the transcripts for the semantic model analysis.</p></sec><sec id="s2-8"><title>Model Refinement on the Dev Set</title><p>In phase 1, the semantic-acoustic model was applied to the Dev Set (n=1336), and hyperparameters (eg, learning rate in optimization algorithms, number of hidden layers, and number of iterations in training a neural network) were optimized to maximize the CCC [<xref ref-type="bibr" rid="ref28">28</xref>].</p></sec><sec id="s2-9"><title>Tests on the Blind Set</title><p>Phase 2 of the project was conducted to evaluate the performance of the semantic-acoustic ML model established in phase 1. The Blind Set used in phase 2 was provided to Ellipsis Health without any accompanying PHQ-9 scores to ensure a blinded test of the model. Other than absence of PHQ-9 scores, the provided metadata categories were the same categories as provided for the Dev Set. ASR was conducted, followed by personally identifiable information and protected health information redaction of both the audio and transcript, using the same process as for the Dev Set. 
Manual annotation of the recordings was performed as in phase 1 and the verbally administered PHQ-9 was masked in the audio file and removed from the call transcript. The ML team then conducted tests of the ML model to predict depression severity scores for the Blind Set and across the metadata subgroups of the Blind Set. Recorded PHQ-9 scores from the original calls in the Blind Set were subsequently provided to the ML team, and PHQ-8 scores were then derived from these PHQ-9 scores and then compared with ML model predictions of the PHQ-8 scores to complete the test of model accuracy.</p><p>In light of the fact that overreporting and underreporting are well-known phenomena of surveys, including on sensitive measures such as the PHQ-9 [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>], a preliminary exploration of the possible presence of such when responding to the PHQ-9 was conducted by examining for sizeable discrepancies between PHQ-8 labels and predicted depression scores. Overreporting and underreporting were defined as a difference of &#x2265;2 categories of classification between the model prediction and the PHQ-8 score, as this would likely cause a significant change in a care pathway for a patient, and this condition was found in 42 of the 2007 total recordings. Five licensed therapists were recruited to listen and rate the member for severity of depression symptoms (none, mild, moderate, and severe). They were assigned recordings such that 1 therapist listened to each of the 42 recordings, but in 25 of those calls at least 2 therapists provided an additional assessment. The therapists were blinded to all information, including the PHQ score and section of the recording where the survey was administered, the model predictions, and demographic information. 
A PHQ-8 score predicted by the ML model was defined as agreeing with a mental health provider assessment if their assessment was the same or within 1 severity category difference.</p></sec><sec id="s2-10"><title>Metrics</title><p>The ML model results included CCC, mean absolute error (MAE), area under the receiver operating characteristic curve (AUROC), and sensitivity and specificity at the point of equal error rate (EER) for the Dev Set and the Blind Set. All classification analyses were conducted with the PHQ-8 as the criterion or observed score. Predicted scores from the ML regression models were binned according to the following PHQ-8 depression severity classifications: none or minimal (0&#x2010;4), mild (5-9), moderate (10-14), moderately severe (15-19), and severe (20-24). Next, ROC analyses were conducted, comparing predicted to observed scores across 4 binary classifications at the 4 PHQ-8 cutoffs (5, 10, 15, and 20): 0-4 versus 5-24, 0-9 versus 10-24, 0-14 versus 15-24, and 0-19 versus 20-24. AUROC and sensitivity and specificity at the EER were calculated at each cutoff and reported for the ML model.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Comparison of Member-Level Metadata</title><p>Data demographic distributions for members were comparable across both datasets. Ages ranged from 18-98 years in the Dev Set and 18-92 years in the Blind Set (<xref ref-type="table" rid="table1">Table 1</xref>). Approximately two-thirds of members across the Dev Set (910/1336, 68.1% of participants) and Blind Set (462/671, 68.9% of participants) were female. Member zip code was used to establish the SVI [<xref ref-type="bibr" rid="ref31">31</xref>], which is based on 16 social factors, including socioeconomic status (eg, below poverty and unemployed), household characteristics (eg, single parent and aged 65 years or older), and housing type or transportation (eg, crowding and no vehicle). 
In each dataset, members were predominantly in the low-moderate range for social vulnerability and the majority of calls were BH case management calls (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of metadata for the Dev and Blind Sets.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metadata</td><td align="left" valign="bottom">Dev Set (n=1336)</td><td align="left" valign="bottom">Blind Set (n=671)</td></tr></thead><tbody><tr><td align="left" valign="top">Age (years), mean (SD), range</td><td align="left" valign="top">53.7 (16.3), 18&#x2010;98</td><td align="left" valign="top">51.7 (16.9), 18&#x2010;92</td></tr><tr><td align="left" valign="top">Age (years), n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x2264;39</td><td align="left" valign="top">296 (22.2)</td><td align="left" valign="top">183 (27.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>40&#x2010;64</td><td align="left" valign="top">704 (52.7)</td><td align="left" valign="top">344 (51.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x2265;65</td><td align="left" valign="top">336 (25.1)</td><td align="left" valign="top">144 (21.4)</td></tr><tr><td align="left" valign="top">Biological sex, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">910 (68.1)</td><td align="left" valign="top">462 (68.9)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">426 (31.9)</td><td align="left" valign="top">207 (30.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Undefined</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">2 (0.3)</td></tr><tr><td align="left" valign="top">SVI<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">279 (20.9)</td><td align="left" valign="top">150 (22.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">617 (46.2)</td><td align="left" valign="top">306 (45.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">335 (25.1)</td><td align="left" valign="top">162 (24.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">102 (7.6)</td><td align="left" valign="top">51 (7.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Missing</td><td align="left" valign="top">3 (0.2)</td><td align="left" valign="top">2 (0.3)</td></tr><tr><td align="left" valign="top">Type of CM<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BH<sup><xref 
ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">1087 (81.4)</td><td align="left" valign="top">561 (83.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Non-BH</td><td align="left" valign="top">249 (18.6)</td><td align="left" valign="top">110 (16.4)</td></tr><tr><td align="left" valign="top">PHQ-8<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup>, mean (SD), range</td><td align="left" valign="top">10.5 (6.1), 0&#x2010;24</td><td align="left" valign="top">10.9 (6.0), 0&#x2010;24</td></tr><tr><td align="left" valign="top">PHQ-8<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup>, n (%)</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None or minimal</td><td align="left" valign="top">249 (18.6)</td><td align="left" valign="top">113 (16.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mild</td><td align="left" valign="top">384 (28.7)</td><td align="left" valign="top">179 (26.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Moderate</td><td align="left" valign="top">328 (24.6)</td><td align="left" valign="top">174 (25.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Moderately severe</td><td align="left" valign="top">263 (19.7)</td><td align="left" valign="top">141 (21.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Severe</td><td align="left" valign="top">112 (8.4)</td><td align="left" valign="top">64 (9.5)</td></tr></tbody></table><table-wrap-foot><fn 
id="table1fn1"><p><sup>a</sup>SVI: Social Vulnerability Index (1 = least vulnerable, 4 = most vulnerable).</p></fn><fn id="table1fn2"><p><sup>b</sup>CM: case management.</p></fn><fn id="table1fn3"><p><sup>c</sup>BH: behavioral health.</p></fn><fn id="table1fn4"><p><sup>d</sup>PHQ-8: Patient Health Questionnaire&#x2014;8 questions.</p></fn><fn id="table1fn5"><p><sup>e</sup>None or minimal=0&#x2010;4, mild=5&#x2010;9, moderate=10&#x2010;14, moderately severe=15&#x2010;19, and severe=20&#x2010;24. Percentages may not add up to 100% due to rounding.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Regression Results for Overall Tests of the ML Model</title><p>Results for the test of the ML model on the Dev Set (n=1336) produced a CCC of &#x03C1;<sub>c</sub>=0.57, which is superior to results expected by chance (&#x03C1;<sub>c</sub>=0.10-0.20). CCC showed minimal decrease in the test of the ML model on the Blind Set (&#x03C1;<sub>c</sub>=0.54; n=671). Furthermore, MAE values for the ML model tests across datasets were 3.91 and 4.06 for the Dev Set and Blind Set, respectively. These values for MAE are equivalent to less than the score range (5 points) of a single PHQ-8 severity classification.</p></sec><sec id="s3-3"><title>Classification Results for Overall Tests of the ML Model</title><p>The AUROC at PHQ-8 cutoff of 10 (ie, &#x201C;moderate&#x201D; depression, the traditional cutoff for the majority of clinical care pathways [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]) was consistent for the ML model as applied to the Dev Set (0.83) and Blind Set (0.81), which are identical to the respective mean AUROC values over the different cutoff points (<xref ref-type="table" rid="table2">Table 2</xref>, top panel). 
In particular, results for the ML model (AUROC=0.81) on the Blind Set indicate the robustness of the model in its ability to identify individuals with PHQ-8 scores above 10 using novel call data (ie, data without PHQ-8 labels and not previously used for model refinement).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Regression and classification results for overall tests of the machine learning model.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Statistic</td><td align="left" valign="top">Dev Set (n=1336)</td><td align="left" valign="top">Blind Set (n=671)</td></tr></thead><tbody><tr><td align="left" valign="top">CCC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.54</td></tr><tr><td align="left" valign="top">MAE<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">3.91</td><td align="left" valign="top">4.06</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;Mean<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> across cutoffs<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.81</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 5</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.85</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 10</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.81</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 15</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.79</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 20</td><td 
align="left" valign="top">0.83</td><td align="left" valign="top">0.79</td></tr><tr><td align="left" valign="top">Sens=Spec<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top">&#x2003;Mean across cutoffs</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.73</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 5</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.76</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 10</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.72</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 15</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.72</td></tr><tr><td align="left" valign="top">&#x2003;Cutoff 20</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.72</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table2fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn><fn id="table2fn3"><p><sup>c</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table2fn4"><p><sup>d</sup>Mean across 4 cutoffs (5, 10, 15, and 20).</p></fn><fn id="table2fn5"><p><sup>e</sup>Cutoff numbers were chosen as these are the points where the depression severity category boundaries occur.</p></fn><fn id="table2fn6"><p><sup>f</sup>Value of both sensitivity and specificity at point of equal error.</p></fn></table-wrap-foot></table-wrap><p>AUROC values across the 4 cutoff thresholds and across both datasets ranged from 0.79 to 0.85 (<xref ref-type="table" rid="table2">Table 2</xref>). The lowest AUROCs were observed for the ML model (AUROC=0.79) on the Blind Set at PHQ-8 cutoffs of 15 and 20 (&#x201C;moderately severe&#x201D; and &#x201C;severe&#x201D; depression). 
Of note, the size of the subsample of members in the Blind Set with scores &#x2265;20 was only 64 members and may have contributed to the lower AUROC values for this classification. See <xref ref-type="table" rid="table1">Table 1</xref> for information on sample sizes across classifications for both datasets.</p><p>As shown in <xref ref-type="table" rid="table2">Table 2</xref>, the mean sensitivity and specificity at the point of equal error across the 4 classifications was stable for ML model performance. Across the 4 PHQ-8 cutoff scores, sensitivity and specificity values ranged from 0.72 at a cutoff of 10, 15, and 20 for the ML model test on the Blind Set to 0.76 for the ML model test on the Dev Set at a cutoff of 20 and on the Blind Set at a cutoff of 5. As observed with AUROC, values at the lower end of the range for sensitivity and specificity may have been affected by smaller subsample sizes (eg, Blind Set with PHQ-8 &#x2265;20).</p></sec><sec id="s3-4"><title>Model Performance Across Metadata Subgroups</title><p>ML model performance was evaluated across metadata subgroups based on age in years (18&#x2010;39, 40&#x2010;64, and &#x2265;65), sex (male and female), and BH case management versus non-BH case management and across the 4 SVI levels (1=least vulnerable and 4=most vulnerable).</p><p>The ML model performance between the 2 datasets (Dev Set and Blind Set) within their subgroups (age [<xref ref-type="table" rid="table3">Table 3</xref>], sex [<xref ref-type="table" rid="table4">Table 4</xref>], type of case management call [<xref ref-type="table" rid="table5">Table 5</xref>], and SVI [<xref ref-type="table" rid="table6">Table 6</xref>]) reveals both consistent and relatively similar AUROC cutoff at 10 and EER values with AUROC cutoff at 10 ranging from 0.81 to 0.83 and sensitivity and specificity at point of equal error ranging from 0.72 to 0.76, implying good model stability and robustness. 
See Figures S1-S12 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for ROC curves (overall and per subgroup on Blind Set). CCC ranged from 0.44 to 0.61, with the lowest (0.44) occurring in the most highly socially vulnerable group in the Blind Set and the highest (0.61) occurring in both the least socially vulnerable group of the Dev Set and the &#x2265;65 years age group in the Blind Set. In most cases, the lower CCC values occurred where sample sizes were approximately 100 or fewer individuals, and our previous work [<xref ref-type="bibr" rid="ref33">33</xref>] suggests a minimum count of approximately 200 individuals for robust estimates of prediction performance. MAE values ranged from 3.62 in the &#x2265;65 years age group of the Blind Set to 4.57 in the non-BH group of the Blind Set. The 2 highest MAE values were associated with subgroups with sample sizes of approximately 100 or fewer participants, comparable with results for the CCC.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Regression and classification metrics for model tests by the subgroup age.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2"/><td align="left" valign="bottom" colspan="3">Dev Set (n=1336)</td><td align="left" valign="bottom" colspan="3">Blind Set (n=671)</td></tr><tr><td align="char" char="." valign="top">Aged &#x2264;39 years (n=296)</td><td align="char" char="." valign="top">Aged 40&#x2010;64 years (n=704)</td><td align="char" char="." 
valign="top">Aged &#x2265;65 years (n=336)</td><td align="left" valign="top">Aged &#x2264;39 years (n=183)</td><td align="left" valign="top">Aged 40&#x2010;64 years (n=344)</td><td align="left" valign="top">Aged &#x2265;65 years (n=144)</td></tr></thead><tbody><tr><td align="left" valign="top">CCC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.47</td><td align="left" valign="top">0.61</td></tr><tr><td align="left" valign="top">MAE<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">3.91</td><td align="left" valign="top">4.00</td><td align="char" char="." valign="top">3.77</td><td align="char" char="." valign="top">3.93</td><td align="char" char="." valign="top">4.32</td><td align="char" char="." valign="top">3.62</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> cutoff<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> 10</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.83</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td></tr><tr><td align="left" valign="top">Sensitivity and specificity<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup> cutoff 10</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.75</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." 
valign="top">0.72</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table3fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn><fn id="table3fn3"><p><sup>c</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table3fn4"><p><sup>d</sup>Cutoff thresholds correspond to the boundaries of clinical depression severity classes.</p></fn><fn id="table3fn5"><p><sup>e</sup>Value of both sensitivity and specificity at point of equal error.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Regression and classification metrics for model tests by the subgroup sex.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" rowspan="2"/><td align="left" valign="top" colspan="2">Dev Set (n=1336)</td><td align="left" valign="top" colspan="2">Blind Set (n=671)</td></tr><tr><td align="left" valign="top">Female (n=910)</td><td align="left" valign="top">Male (n=426)</td><td align="left" valign="top">Female (n=462)</td><td align="left" valign="top">Male (n=207)</td></tr></thead><tbody><tr><td align="left" valign="top">CCC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="char" char="." valign="top">0.56</td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.53</td><td align="left" valign="top">0.54</td></tr><tr><td align="left" valign="top">MAE<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">3.86</td><td align="char" char="." valign="top">4.04</td><td align="char" char="." valign="top">3.95</td><td align="char" char="." valign="top">4.29</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> cutoff<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> 10</td><td align="char" char="." 
valign="top">0.83</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td></tr><tr><td align="left" valign="top">Sensitivity and specificity<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup> cutoff 10</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.72</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table4fn2"><p><sup>b</sup>MAE: mean absolute error.</p></fn><fn id="table4fn3"><p><sup>c</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table4fn4"><p><sup>d</sup>Cutoff thresholds correspond to the boundaries of clinical depression severity classes.</p></fn><fn id="table4fn5"><p><sup>e</sup>Value of both sensitivity and specificity at point of equal error.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Regression and classification metrics for model tests by the subgroup type of case management.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" rowspan="2"/><td align="left" valign="top" colspan="2">Dev Set (n=1336)</td><td align="left" valign="top" colspan="2">Blind Set (n=671)</td></tr><tr><td align="left" valign="top">BH<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> (n=1087)</td><td align="left" valign="top">Non-BH (n=249)</td><td align="left" valign="top">BH (n=561)</td><td align="left" valign="top">Non-BH (n=110)</td></tr></thead><tbody><tr><td align="left" valign="top">CCC<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="char" char="." 
valign="top">0.57</td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.46</td></tr><tr><td align="left" valign="top">MAE<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="char" char="." valign="top">3.92</td><td align="char" char="." valign="top">3.86</td><td align="char" char="." valign="top">4.00</td><td align="char" char="." valign="top">4.40</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup> cutoff<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup> 10</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td></tr><tr><td align="left" valign="top">Sensitivity and specificity<sup><xref ref-type="table-fn" rid="table5fn6">f</xref></sup> cutoff 10</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." 
valign="top">0.72</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>BH: behavioral health.</p></fn><fn id="table5fn2"><p><sup>b</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table5fn3"><p><sup>c</sup>MAE: mean absolute error.</p></fn><fn id="table5fn4"><p><sup>d</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table5fn5"><p><sup>e</sup>Cutoff thresholds correspond to the boundaries of clinical depression severity classes.</p></fn><fn id="table5fn6"><p><sup>f</sup>Value of both sensitivity and specificity at point of equal error.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Regression and classification metrics for model tests by the subgroup Social Vulnerability Index (SVI).</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" rowspan="2"/><td align="left" valign="top" colspan="4">Dev Set (n=1336)</td><td align="left" valign="top" colspan="4">Blind Set (n=671)</td></tr><tr><td align="char" char="." valign="top">SVI<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>=1 (n=279)</td><td align="char" char="." valign="top">SVI=2 (n=617)</td><td align="char" char="." valign="top">SVI=3 (n=335)</td><td align="char" char="." valign="top">SVI=4 (n=102)</td><td align="char" char="." valign="top">SVI=1 (n=150)</td><td align="left" valign="top">SVI=2 (n=306)</td><td align="left" valign="top">SVI=3 (n=162)</td><td align="left" valign="top">SVI=4 (n=51)</td></tr></thead><tbody><tr><td align="left" valign="top">CCC<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="char" char="." valign="top">0.61</td><td align="char" char="." valign="top">0.57</td><td align="char" char="." valign="top">0.57</td><td align="char" char="." valign="top">0.46</td><td align="char" char="." valign="top">0.61</td><td align="char" char="." 
valign="top">0.54</td><td align="char" char="." valign="top">0.47</td><td align="char" char="." valign="top">0.44</td></tr><tr><td align="left" valign="top">MAE<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="char" char="." valign="top">3.76</td><td align="char" char="." valign="top">3.90</td><td align="char" char="." valign="top">3.98</td><td align="char" char="." valign="top">4.24</td><td align="char" char="." valign="top">3.83</td><td align="char" char="." valign="top">4.04</td><td align="char" char="." valign="top">4.18</td><td align="char" char="." valign="top">4.57</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup> cutoff<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup> 10</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.83</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td><td align="char" char="." valign="top">0.81</td></tr><tr><td align="left" valign="top">Sensitivity and specificity<sup><xref ref-type="table-fn" rid="table6fn6">f</xref></sup> cutoff 10</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." valign="top">0.72</td><td align="char" char="." 
valign="top">0.72</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>SVI: Social Vulnerability Index (1=least vulnerable, 4=most vulnerable).</p></fn><fn id="table6fn2"><p><sup>b</sup>CCC: concordance correlation coefficient.</p></fn><fn id="table6fn3"><p><sup>c</sup>MAE: mean absolute error.</p></fn><fn id="table6fn4"><p><sup>d</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table6fn5"><p><sup>e</sup>Cutoff thresholds correspond to the boundaries of clinical depression severity classes.</p></fn><fn id="table6fn6"><p><sup>f</sup>Value of both sensitivity and specificity at point of equal error.</p></fn></table-wrap-foot></table-wrap><p>Finally, within the Dev Set, 3.1% (42/1336) of recordings showed sizable discrepancies (divergence equivalent to 2 or more PHQ-8 categories) between administered PHQ-8 and ML-predicted depression score that could imply PHQ-8 response underreporting (actual depression score much lower than predicted depression score) or overreporting (actual depression score much higher than predicted depression score). A review of these discrepancies by 5 independently licensed clinicians, who were blinded to the administered score, yielded PHQ-8 categorization of members&#x2019; vocalizations that were consistent with the ML model categorization twice as often as they were with the administered PHQ-8 score.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>The current evaluation, leveraging speech analysis to detect depression symptoms across different levels of severity within a large real-world clinical case management context, represents, to our knowledge, the first evaluation of its kind. Overall, the findings for the combined acoustic-semantic ML model demonstrate strong model performance across a variety of key metrics. 
The AUROC value of approximately 0.81, the overall CCC value of 0.54, and mean of the sensitivity and specificity at the EER of 0.73 in the Blind Set demonstrate robust clinical support for the model&#x2019;s ability to accurately predict severity of depression. These results compare favorably to previous research, which has primarily relied on much smaller samples and incorporated pilot, simulation, or controlled study designs [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Whereas many prior studies have focused on application of acoustic or semantic speech analysis, this study reports on an approach that combined information from both semantic and acoustic-based models. Future development and testing of speech analysis models should continue to explore the benefits of combining acoustic and semantic models.</p><p>It is noteworthy that the model performed consistently across all major subgroups and PHQ-8 classification levels, with AUROC values ranging from 0.79 to 0.85 and CCC values ranging from 0.44 to 0.61. However, the model did undercharacterize individuals at the highest PHQ-8 level (ie, &#x201C;severe&#x201D; depression), likely due to smaller sample size in this category of depression on which this model was trained. Among the most promising findings from the subgroup analyses was the model&#x2019;s strong accuracy in predicting depression severity among older adults across key metrics. 
This finding is particularly significant, given that older adults have the highest rates of undetected depression [<xref ref-type="bibr" rid="ref34">34</xref>], are often less likely to recognize or self-report symptoms of depression [<xref ref-type="bibr" rid="ref35">35</xref>], and may experience depression with fewer dysphoric symptoms and more somatic complaints, which can be misattributed to physical illness [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>].</p><p>The model&#x2019;s ability to detect depression symptoms at lower severity levels offers significant real-world potential for early identification and person-fit and right-sized interventions earlier in individuals&#x2019; clinical trajectories. In addition to its ability to classify depression presence (PHQ-8&#x003C;10 vs &#x2265;10), the model performed well across specific PHQ-8 severity levels, particularly in the minimal and mild ranges. This suggests promising applications for early, proactive, cost-effective, and lower-intensity interventions (eg, digital interventions, BH coaching, and peer or social support) that may prevent symptom progression and reduce relapse rates. Notably, beyond the personal and clinical benefits, earlier interventions and prevention of depression may also have significant financial implications, potentially reducing health care costs associated with advanced-stage depression treatment.</p><p>Beyond BH contexts, the model&#x2019;s performance in general medical (non-BH) case management calls suggests even greater potential for broadening depression detection. The model achieved a comparable level of performance (AUROC cutoff 10=0.81; range=0.79-0.85) in non-BH case manager calls, indicating potential for integrating depression detection into clinical decision-making in settings where the PHQ-9 is not routinely administered and where depression is often undetected [<xref ref-type="bibr" rid="ref38">38</xref>]. 
Furthermore, the potential financial impact of enhanced depression detection in non-BH contexts is considerable, especially given that the total cost of care for members with both a BH condition and a chronic physical health condition, experienced by many members in medical-surgical case management, is approximately 3 times higher than for those with the same condition but no BH diagnosis [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. The foregoing notwithstanding, findings related to model performance in non-BH case management should be considered preliminary in light of the smaller sample size (n=110, Blind Set sample). Accordingly, additional application of speech analysis in non-BH contexts and other physical health settings is warranted.</p><p>The findings with respect to the overreporting and underreporting on the PHQ-8 offer insights into the potential for speech-based analysis to enhance depression detection objectivity [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]. Specifically, speech analysis may be less susceptible to bias, whether conscious or unconscious, relative to subjective report or traditional measurement. The observed trend of likely underreporting on the PHQ-8 (ie, the administered score being much lower than the predicted score) aligns with prior research on the impact of stigma, lower BH literacy, and other factors that contribute to self-report bias and underreporting of depression [<xref ref-type="bibr" rid="ref44">44</xref>]. 
On the other hand, overreporting on the PHQ-8 (ie, the administered score being much higher than the predicted score) may suggest heightened or exaggerated response tendencies, personality characteristics, or efforts to obtain help [<xref ref-type="bibr" rid="ref45">45</xref>].</p><p>Finally, this study highlights important opportunities for speech analysis and other digital phenotyping approaches to improve administrative and clinical workflows. It is noteworthy that case managers spent nearly 20% of total call time administering the PHQ-9. From an efficiency standpoint, this is time that could be better allocated to establishing a therapeutic alliance, collaborating to identify and define BH goals, addressing ambivalence and other potential obstacles to achieving those goals, and directly addressing the member&#x2019;s chief concerns. Greater efficiency also establishes opportunity for case managers to interact with more members. From a clinical process standpoint, time spent administering measures such as the PHQ-9 can be awkward and even frustrating for members and may adversely affect rapport and engagement. Furthermore, inconsistencies in PHQ-9 administration can introduce errors or variability in measurement, potentially leading to misinterpretation of symptoms. 
In contrast, having objective, real-time data on depression severity could provide valuable insights for clinical decision-making and for developing proactive and personalized treatment plans.</p></sec><sec id="s4-2"><title>Strengths and Limitations</title><p>This study has several key strengths, including its large sample size, evaluation of speech analysis in a real-world clinical context, use of naturalistic conversations versus analog speech tasks (such as reading defined passages or phrases, or repeating specified sounds), integration of both semantic and acoustic properties of speech, and analysis of model performance across subgroups and depression severity levels using numerous evaluation metrics. At the same time, there are several limitations that should be considered when interpreting the current findings and guiding future research.</p><p>While the large sample size included in this real-world evaluation is unique in the field of digital phenotyping [<xref ref-type="bibr" rid="ref15">15</xref>], the sample sizes for some of the subanalyses, including the analyses of the non-BH calls and the highest PHQ-9 severity level (&#x201C;Severe&#x201D;) condition, were relatively small. Given this, caution should be exercised when interpreting these findings. Moreover, data on member race and ethnicity were not available. As such, the generalizability of the current results to different ethnic, cultural, and linguistic groups cannot be definitively determined. That said, the acoustic-semantic speech model was developed and trained on a very large and diverse sample [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref46">46</xref>].</p><p>Furthermore, prediction of depression by the model, like with any ML model, includes a degree of error or imprecision. 
In the current analysis, this was equivalent to approximately 4.06 points on average (the reported MAE) on the PHQ-9, which itself has imperfect accuracy [<xref ref-type="bibr" rid="ref47">47</xref>]. As such, predicted scores should be interpreted with this in mind. With additional data, precision is likely to further increase.</p><p>In addition, the collection of the case management recordings on a single audio channel posed a challenge for this study, necessitating the use of ASR for conversion of speech to text, diarization for speaker separation, and speaker attribution labeling. While these processes generally have low error rates, they are not entirely error-free. Diarization was performed using a leading commercial tool as manual processing of calls would have required listening to and annotating thousands of hours of recordings, a time-intensive process that is also prone to errors. Additionally, the use of automatic speech processing better reflects how an actual implementation would be performed in a real-world setting. However, diarization errors (ie, poor or missing speaker separation) were encountered, and these errors propagated through the preprocessing and annotation pipeline, affecting both automated speaker attribution and removal of PHQ-8 content.</p><p>Furthermore, data curation errors observed (eg, inclusion of voicemails and conversations with minors), inevitable in a real-world dataset of this kind, may have impacted performance (in both positive and negative directions); on balance, however, these likely did not have more than a negligible effect on the reported performance metrics. 
Many of these challenges and resulting errors may be attenuated in prospective implementations (vs the current retrospective-focused analysis) in the future, given that (1) current call management systems routinely record speakers on separate channels, significantly mitigating the challenges of diarization (many legacy systems do not have diarization capability but some may be specially configured to do so), and (2) formal implementation of this technology within the case manager&#x2019;s workflow would limit inclusion of irrelevant (eg, voicemail messages) or inappropriate (eg, minors and different languages) calls through either call management software technology or manual exclusion by the case manager according to inclusion and exclusion criteria.</p></sec><sec id="s4-3"><title>Future Studies and Real-World Deployments</title><p>The successful deployment of AI-driven speech analysis for depression detection requires careful integration into existing health care workflows. One promising approach is its integration into telehealth platforms, where it could facilitate real-time assessment and early detection during virtual consultations. Embedding the model into electronic health records, virtual scribe technology or clinical decision support systems could further enhance its use by providing clinicians with objective, data-driven insights alongside traditional assessments. This study represents a step toward the rigorous validation of AI-based health care tools, ensuring their accuracy and reliability across diverse populations. For successful deployment, ensuring security, safety, and compliance with Health Insurance Portability and Accountability Act, General Data Protection Regulation, and other data protection regulations is essential, along with continuous monitoring of system performance on test datasets to maintain reliability and accuracy. 
Additionally, clinician adoption depends on ensuring that the tool is user-friendly and seamlessly integrates into existing workflows without adding unnecessary burden.</p><p>Although the data used in this study are deidentified, future studies and real-world deployments should incorporate a protocol for obtaining explicit informed consent from members, ensuring ethical transparency and alignment with established guidelines for digital health interventions. One of the primary challenges in ML-based depression detection is the mitigation of bias, as algorithmic outputs may be influenced by dataset imbalances or systemic biases. In this study, bias evaluation was conducted across key demographic subgroups, but future research should expand on bias mitigation strategies and assess ethical AI deployment frameworks to ensure equitable model performance across diverse populations.</p><p>Additionally, AI governance is a critical factor in real-world deployment, necessitating adherence to key principles such as transparency, fairness, and accountability. Transparency ensures that AI models operate in a manner that is understandable, explainable, and accessible to stakeholders, including clinicians and patients. Fairness requires that models are developed and validated in a way that minimizes bias and ensures equitable performance across diverse populations, preventing disparities in mental health assessments. Accountability involves establishing clear oversight mechanisms to monitor AI decision-making, ensuring that these technologies align with ethical standards, regulatory requirements, and best practices for patient care. Future research and implementation should prioritize these principles to foster trust and reliability in AI-driven mental health tools.</p><p>Finally, while the ML model demonstrated strong predictive performance, it is important to emphasize that this tool is intended for initial assessment and triage and not for medical diagnosis. 
The model is designed to support early identification and risk stratification, which should be followed by clinician evaluation and judgment. This tool is not intended to replace traditional diagnostic methods.</p></sec><sec id="s4-4"><title>Conclusions</title><p>There is an urgent need to enhance detection and measurement of depression. Implementing digital phenotyping through the use of speech as a digital biomarker of depression offers significant promise for improving and accelerating depression identification and treatment. In short, the current evaluation, involving the examination of combined acoustic and semantic speech analysis for predicting depression symptom severity across PHQ-9 classification levels in a large real-world clinical context, represents the first evaluation of its type. The results reported herein provide strong support for the application and use of a readily available and unobtrusive biomarker, namely, the what and how of spoken language, for detecting and measuring depression in real-world practice at this important time. This easily accessible biomarker has significant potential for application in health care settings, ranging from &#x201C;preclinical&#x201D; case management contexts to patient-provider interactions. It is hoped that the current findings help to advance the development and application of novel ML technologies for automating and enhancing depression symptom measurement and for informing and advancing clinical decision-making, next-best actions, and personalized treatment recommendations. Moving analysis of speech for the detection of depression symptoms&#x2014;not long ago deemed science fiction&#x2014;to clinical reality presents considerable opportunities for changing the paradigm of BH care to be more efficient, personalized, proactive, and upstream-focused.</p></sec></sec></body><back><ack><p>This project was supported by the Richard King Mellon Foundation (grant 10714). 
The authors would like to acknowledge the Ellipsis machine learning and data analysis contributions of Piotr Chlebek, Tomasz Rutowski, Amir Harati, Tulio Goulart, Robert Rozanski, and Yang Lu; the generous contributions of Farshid Haque and Tahmida Nazreen for proofreading and formatting of the manuscript, and of Marija Stanojevic for comments on the machine learning portions of the manuscript. The authors would also like to acknowledge Nina Roth for her extensive support in manuscript preparation.</p></ack><notes><sec><title>Data Availability</title><p>The datasets presented in this paper are not readily available because many of the datasets that support the findings of this study are proprietary. Requests to access the datasets should be directed to MA, mike@ellipsishealth.com.</p></sec></notes><fn-group><fn fn-type="con"><p>BK, RA, MA, ES, and MH participated in conceptualization, methodology, investigation, and writing&#x2014;original draft; BK, MA, ES, MH, DH, and RA participated in supervision; and BK, DH, RA, MA, and ES participated in writing&#x2014;review and editing.</p></fn><fn fn-type="conflict"><p>MA, ES, and MH are affiliated with Ellipsis Health, a for-profit health care technology company whose algorithms and analytic services were used in this study.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ASR</term><def><p>automatic speech recognition</p></def></def-item><def-item><term id="abb2">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb3">BH</term><def><p>behavioral health</p></def></def-item><def-item><term id="abb4">CCC</term><def><p>concordance correlation coefficient</p></def></def-item><def-item><term id="abb5">EER</term><def><p>equal error rate</p></def></def-item><def-item><term id="abb6">MAE</term><def><p>mean absolute error</p></def></def-item><def-item><term id="abb7">ML</term><def><p>machine learning</p></def></def-item><def-item><term 
id="abb8">PHQ-8</term><def><p>Patient Health Questionnaire&#x2014;8 questions</p></def></def-item><def-item><term id="abb9">PHQ-9</term><def><p>Patient Health Questionnaire&#x2014;9 questions</p></def></def-item><def-item><term id="abb10">SFTP</term><def><p>secure file transfer protocol</p></def></def-item><def-item><term id="abb11">SVI</term><def><p>Social Vulnerability Index</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>2021 NSDUH Annual National Report</article-title><source>Substance Abuse and Mental Health Services Administration</source><year>2021</year><access-date>2024-06-24</access-date><publisher-name>CBHSQ Data</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.samhsa.gov/data/report/2021-nsduh-annual-national-report">https://www.samhsa.gov/data/report/2021-nsduh-annual-national-report</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vahratian</surname><given-names>A</given-names> </name><name name-style="western"><surname>Blumberg</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Terlizzi</surname><given-names>EP</given-names> </name><name name-style="western"><surname>Schiller</surname><given-names>JS</given-names> </name></person-group><article-title>Symptoms of anxiety or depressive disorder and use of mental health care among adults during the COVID-19 pandemic&#x2014;United States, August 2020&#x2013;February 2021</article-title><source>MMWR Morb Mortal Wkly Rep</source><volume>70</volume><issue>13</issue><fpage>490</fpage><lpage>494</lpage><pub-id pub-id-type="doi">10.15585/mmwr.mm7013e2</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ettman</surname><given-names>CK</given-names> </name><name name-style="western"><surname>Abdalla</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>GH</given-names> </name><name name-style="western"><surname>Sampson</surname><given-names>L</given-names> </name><name name-style="western"><surname>Vivier</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Galea</surname><given-names>S</given-names> </name></person-group><article-title>Prevalence of depression symptoms in US adults before and during the COVID-19 pandemic</article-title><source>JAMA Netw Open</source><year>2020</year><month>09</month><day>1</day><volume>3</volume><issue>9</issue><fpage>e2019686</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.19686</pub-id><pub-id pub-id-type="medline">32876685</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Demler</surname><given-names>O</given-names> </name><name name-style="western"><surname>Kessler</surname><given-names>RC</given-names> </name></person-group><article-title>Adequacy of treatment for serious mental illness in the United States</article-title><source>Am J Public Health</source><year>2002</year><month>01</month><volume>92</volume><issue>1</issue><fpage>92</fpage><lpage>98</lpage><pub-id pub-id-type="doi">10.2105/ajph.92.1.92</pub-id><pub-id pub-id-type="medline">11772769</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Lovett</surname><given-names>L</given-names> </name></person-group><article-title>Highmark health&#x2019;s behavioral health director: personalized care, upstream prevention 
will define the industry</article-title><source>BH Business</source><year>2023</year><month>04</month><day>10</day><access-date>2024-06-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://bhbusiness.com/2023/04/10/highmark-healths-behavioral-health-director-personalized-care-upstream-prevention-will-define-the-industry/">https://bhbusiness.com/2023/04/10/highmark-healths-behavioral-health-director-personalized-care-upstream-prevention-will-define-the-industry/</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fahed</surname><given-names>M</given-names> </name><name name-style="western"><surname>McManus</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vahia</surname><given-names>IV</given-names> </name><name name-style="western"><surname>Offodile</surname><given-names>AC</given-names>  <suffix>II</suffix></name></person-group><article-title>Digital phenotyping of behavioral symptoms as the next frontier for personalized and proactive cancer care</article-title><source>JCO Clin Cancer Inform</source><year>2022</year><month>10</month><volume>6</volume><issue>6</issue><fpage>e2200095</fpage><pub-id pub-id-type="doi">10.1200/CCI.22.00095</pub-id><pub-id pub-id-type="medline">36265113</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huckvale</surname><given-names>K</given-names> </name><name name-style="western"><surname>Venkatesh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Christensen</surname><given-names>H</given-names> </name></person-group><article-title>Toward clinical digital phenotyping: a timely opportunity to consider purpose, quality, and safety</article-title><source>NPJ Digit 
Med</source><year>2019</year><volume>2</volume><fpage>88</fpage><pub-id pub-id-type="doi">10.1038/s41746-019-0166-1</pub-id><pub-id pub-id-type="medline">31508498</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Insel</surname><given-names>TR</given-names> </name></person-group><article-title>Digital phenotyping: technology for a new science of behavior</article-title><source>JAMA</source><year>2017</year><month>10</month><day>3</day><volume>318</volume><issue>13</issue><fpage>1215</fpage><lpage>1216</lpage><pub-id pub-id-type="doi">10.1001/jama.2017.11295</pub-id><pub-id pub-id-type="medline">28973224</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohr</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Shilton</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hotopf</surname><given-names>M</given-names> </name></person-group><article-title>Digital phenotyping, behavioral sensing, or personal sensing: names and transparency in the digital age</article-title><source>NPJ Digit Med</source><year>2020</year><volume>3</volume><fpage>45</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-0251-5</pub-id><pub-id pub-id-type="medline">32219186</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chia</surname><given-names>AZR</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>MWB</given-names> </name></person-group><article-title>Digital phenotyping in psychiatry: a scoping review</article-title><source>Technol Health 
Care</source><year>2022</year><volume>30</volume><issue>6</issue><fpage>1331</fpage><lpage>1342</lpage><pub-id pub-id-type="doi">10.3233/THC-213648</pub-id><pub-id pub-id-type="medline">35661034</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herrman</surname><given-names>H</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kieling</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Time for united action on depression: a Lancet&#x2013;World Psychiatric Association Commission</article-title><source>Lancet</source><year>2022</year><month>03</month><volume>399</volume><issue>10328</issue><fpage>957</fpage><lpage>1022</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(21)02141-3</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Galatzer-Levy</surname><given-names>IR</given-names> </name><name name-style="western"><surname>Onnela</surname><given-names>JP</given-names> </name></person-group><article-title>Machine learning and the digital measurement of psychological health</article-title><source>Annu Rev Clin Psychol</source><year>2023</year><month>05</month><day>9</day><volume>19</volume><fpage>133</fpage><lpage>154</lpage><pub-id pub-id-type="doi">10.1146/annurev-clinpsy-080921-073212</pub-id><pub-id pub-id-type="medline">37159287</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Low</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Bentley</surname><given-names>KH</given-names> </name><name 
name-style="western"><surname>Ghosh</surname><given-names>SS</given-names> </name></person-group><article-title>Automated assessment of psychiatric disorders using speech: a systematic review</article-title><source>Laryngoscope Investig Otolaryngol</source><year>2020</year><month>02</month><volume>5</volume><issue>1</issue><fpage>96</fpage><lpage>116</lpage><pub-id pub-id-type="doi">10.1002/lio2.354</pub-id><pub-id pub-id-type="medline">32128436</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koutsouleris</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hauser</surname><given-names>TU</given-names> </name><name name-style="western"><surname>Skvortsova</surname><given-names>V</given-names> </name><name name-style="western"><surname>De Choudhury</surname><given-names>M</given-names> </name></person-group><article-title>From promise to practice: towards the realisation of AI-informed mental health care</article-title><source>Lancet Digit Health</source><year>2022</year><month>11</month><volume>4</volume><issue>11</issue><fpage>e829</fpage><lpage>e840</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(22)00153-4</pub-id><pub-id pub-id-type="medline">36229346</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flanagan</surname><given-names>O</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Roop</surname><given-names>P</given-names> </name><name name-style="western"><surname>Sundram</surname><given-names>F</given-names> </name></person-group><article-title>Using acoustic speech patterns from smartphones to investigate mood disorders: scoping review</article-title><source>JMIR Mhealth 
Uhealth</source><year>2021</year><month>09</month><day>17</day><volume>9</volume><issue>9</issue><fpage>e24352</fpage><pub-id pub-id-type="doi">10.2196/24352</pub-id><pub-id pub-id-type="medline">34533465</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>L</given-names> </name></person-group><article-title>Automatic depression detection: an emotional audio-textual corpus and a GRU/bilstm-based model</article-title><conf-name>ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><conf-loc>Singapore, Singapore</conf-loc><publisher-name>IEEE</publisher-name><fpage>6247</fpage><lpage>6251</lpage><pub-id pub-id-type="doi">10.1109/ICASSP43922.2022.9746569</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Cox</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Le</surname><given-names>TP</given-names> </name><etal/></person-group><article-title>Using machine learning of computerized vocal expression to measure blunted vocal affect and alogia</article-title><source>NPJ Schizophr</source><year>2020</year><month>09</month><day>25</day><volume>6</volume><issue>1</issue><fpage>26</fpage><pub-id pub-id-type="doi">10.1038/s41537-020-00115-2</pub-id><pub-id pub-id-type="medline">32978400</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>El-Den</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Gan</surname><given-names>YL</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>E</given-names> </name><name name-style="western"><surname>O&#x2019;Reilly</surname><given-names>CL</given-names> </name></person-group><article-title>The psychometric properties of depression screening tools in primary healthcare settings: a systematic review</article-title><source>J Affect Disord</source><year>2018</year><month>01</month><day>1</day><volume>225</volume><fpage>503</fpage><lpage>522</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2017.08.060</pub-id><pub-id pub-id-type="medline">28866295</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kroenke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Spitzer</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>JB</given-names> </name></person-group><article-title>The PHQ-9: validity of a brief depression severity measure</article-title><source>J Gen Intern Med</source><year>2001</year><month>09</month><volume>16</volume><issue>9</issue><fpage>606</fpage><lpage>613</lpage><pub-id pub-id-type="doi">10.1046/j.1525-1497.2001.016009606.x</pub-id><pub-id pub-id-type="medline">11556941</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kroenke</surname><given-names>K</given-names> </name><name name-style="western"><surname>Spitzer</surname><given-names>RL</given-names> </name></person-group><article-title>The PHQ-9: a new depression diagnostic and severity 
measure</article-title><source>Psychiatr Ann</source><year>2002</year><month>09</month><volume>32</volume><issue>9</issue><fpage>509</fpage><lpage>515</lpage><pub-id pub-id-type="doi">10.3928/0048-5713-20020901-06</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rutowski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shriberg</surname><given-names>E</given-names> </name><name name-style="western"><surname>Harati</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Oliveira</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chlebek</surname><given-names>P</given-names> </name></person-group><article-title>Cross-demographic portability of deep NLP-based depression models</article-title><conf-name>2021 IEEE Spoken Language Technology Workshop (SLT)</conf-name><conf-loc>Shenzhen, China</conf-loc><publisher-name>IEEE</publisher-name><fpage>1052</fpage><lpage>1057</lpage><pub-id pub-id-type="doi">10.1109/SLT48900.2021.9383609</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Harati</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shriberg</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rutowski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chlebek</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Oliveira</surname><given-names>R</given-names> </name></person-group><article-title>Speech-based depression prediction using 
encoder-weight-only transfer learning and a large corpus</article-title><year>2021</year><conf-name>ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><conf-loc>Toronto, ON, Canada</conf-loc><publisher-name>IEEE</publisher-name><fpage>7273</fpage><lpage>7277</lpage><pub-id pub-id-type="doi">10.1109/ICASSP39728.2021.9414208</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Nazreen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rutowski</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Feasibility of a machine learning-based smartphone application in detecting depression and anxiety in a generally senior population</article-title><source>Front Psychol</source><year>2022</year><volume>13</volume><fpage>811517</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2022.811517</pub-id><pub-id pub-id-type="medline">35478769</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Baevski</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Auli</surname><given-names>M</given-names> </name></person-group><article-title>Wav2vec 2.0: a framework for self-supervised learning of speech representations</article-title><source>Adv Neural Inf Process Syst</source><comment>Preprint posted online on 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2006.11477</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><article-title>Longformer: the long-document transformer</article-title><source>arXiv</source><comment>Preprint posted online on 2020</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2004.05150">https://arxiv.org/abs/2004.05150</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><article-title>Speech to text&#x2014;Amazon Transcribe&#x2014;AWS</article-title><source>Amazon Web Services, Inc</source><year>2024</year><month>06</month><day>24</day><access-date>2024-06-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aws.amazon.com/transcribe/">https://aws.amazon.com/transcribe/</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Introducing medical language processing with Amazon Comprehend Medical | AWS ML blog</article-title><source>Amazon Web Services, Inc</source><year>2018</year><month>11</month><day>27</day><access-date>2024-06-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aws.amazon.com/blogs/machine-learning/introducing-medical-language-processing-with-amazon-comprehend-medical/">https://aws.amazon.com/blogs/machine-learning/introducing-medical-language-processing-with-amazon-comprehend-medical/</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Fara</surname><given-names>S</given-names> </name><name name-style="western"><surname>Goria</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Molimpakis</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cummins</surname><given-names>N</given-names> </name></person-group><article-title>Speech and the n-Back task as a lens into depression. How combining both may allow us to isolate different core symptoms of depression</article-title><source>Interspeech 2022</source><year>2022</year><publisher-name>ISCA</publisher-name><fpage>1911</fpage><lpage>1915</lpage><pub-id pub-id-type="doi">10.48550/arXiv.2204.00088</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Seneviratne</surname><given-names>N</given-names> </name><name name-style="western"><surname>Espy-Wilson</surname><given-names>C</given-names> </name></person-group><article-title>Multimodal depression severity score prediction using articulatory coordination features and hierarchical attention based text embeddings</article-title><year>2022</year><conf-name>Interspeech 2022</conf-name><fpage>3353</fpage><lpage>3357</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2022-11099</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ravi</surname><given-names>V</given-names> </name><name name-style="western"><surname>Flint</surname><given-names>J</given-names> </name><name name-style="western"><surname>Alwan</surname><given-names>A</given-names> </name></person-group><article-title>Unsupervised instance discriminative learning for depression detection from speech signals</article-title><year>2022</year><conf-name>Interspeech 2022</conf-name><fpage>2018</fpage><lpage>2022</lpage><pub-id 
pub-id-type="doi">10.21437/Interspeech.2022-10814</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>Social Vulnerability Index (CDC/ATSDR SVI)</article-title><source>CDC/ATSDR</source><year>2024</year><month>06</month><day>14</day><access-date>2024-06-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.atsdr.cdc.gov/place-health/php/svi/?CDC_AAref_Val=https://www.atsdr.cdc.gov/placeandhealth/svi/index.html">https://www.atsdr.cdc.gov/place-health/php/svi/?CDC_AAref_Val=https://www.atsdr.cdc.gov/placeandhealth/svi/index.html</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bn</surname><given-names>S</given-names> </name><name name-style="western"><surname>Abdullah</surname><given-names>S</given-names> </name></person-group><article-title>Privacy sensitive speech analysis using federated learning to assess depression</article-title><year>2022</year><conf-name>ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name><publisher-name>IEEE</publisher-name><fpage>6272</fpage><lpage>6276</lpage><pub-id pub-id-type="doi">10.1109/ICASSP43922.2022.9746827</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rutowski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harati</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shriberg</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chlebek</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Oliveira</surname><given-names>R</given-names> </name></person-group><article-title>Toward corpus size requirements for training and evaluating depression risk models using spoken language</article-title><year>2022</year><conf-name>Interspeech 2022</conf-name><fpage>3343</fpage><lpage>3347</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2022-10888</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zenebe</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Akele</surname><given-names>B</given-names> </name><name name-style="western"><surname>W/Selassie</surname><given-names>M</given-names> </name><name name-style="western"><surname>Necho</surname><given-names>M</given-names> </name></person-group><article-title>Prevalence and determinants of depression among old age: a systematic review and meta-analysis</article-title><source>Ann Gen Psychiatry</source><year>2021</year><month>12</month><day>18</day><volume>20</volume><issue>1</issue><fpage>55</fpage><pub-id pub-id-type="doi">10.1186/s12991-021-00375-x</pub-id><pub-id pub-id-type="medline">34922595</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Devita</surname><given-names>M</given-names> </name><name name-style="western"><surname>De Salvo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ravelli</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Recognizing depression in the elderly: practical guidance and challenges for clinical management</article-title><source>Neuropsychiatr Dis Treat</source><year>2022</year><volume>18</volume><fpage>2867</fpage><lpage>2880</lpage><pub-id pub-id-type="doi">10.2147/NDT.S347356</pub-id><pub-id 
pub-id-type="medline">36514493</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gottfries</surname><given-names>CG</given-names> </name></person-group><article-title>Is there a difference between elderly and younger patients with regard to the symptomatology and aetiology of depression?</article-title><source>Int Clin Psychopharmacol</source><year>1998</year><month>09</month><volume>13 Suppl 5</volume><fpage>S13</fpage><lpage>S18</lpage><pub-id pub-id-type="doi">10.1097/00004850-199809005-00004</pub-id><pub-id pub-id-type="medline">9817615</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hegeman</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Kok</surname><given-names>RM</given-names> </name><name name-style="western"><surname>van der Mast</surname><given-names>RC</given-names> </name><name name-style="western"><surname>Giltay</surname><given-names>EJ</given-names> </name></person-group><article-title>Phenomenology of depression in older compared with younger adults: meta-analysis</article-title><source>Br J Psychiatry</source><year>2012</year><month>04</month><volume>200</volume><issue>4</issue><fpage>275</fpage><lpage>281</lpage><pub-id pub-id-type="doi">10.1192/bjp.bp.111.095950</pub-id><pub-id pub-id-type="medline">22474233</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ducat</surname><given-names>L</given-names> </name><name name-style="western"><surname>Philipson</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Anderson</surname><given-names>BJ</given-names> </name></person-group><article-title>The mental health 
comorbidities of diabetes</article-title><source>JAMA</source><year>2014</year><month>08</month><day>20</day><volume>312</volume><issue>7</issue><fpage>691</fpage><lpage>692</lpage><pub-id pub-id-type="doi">10.1001/jama.2014.8040</pub-id><pub-id pub-id-type="medline">25010529</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Davenport</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gray</surname><given-names>T</given-names> </name><name name-style="western"><surname>Melek</surname><given-names>S</given-names> </name><collab>Milliman Research Report</collab></person-group><article-title>Milliman high-cost patient study 2020</article-title><year>2020</year><month>08</month><day>13</day><access-date>2025-05-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.milliman.com/-/media/milliman/pdfs/articles/milliman-high-cost-patient-study-2020.pdf">https://www.milliman.com/-/media/milliman/pdfs/articles/milliman-high-cost-patient-study-2020.pdf</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bellon</surname><given-names>J</given-names> </name><name name-style="western"><surname>Quinlan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>B</given-names> </name><name name-style="western"><surname>Nemecek</surname><given-names>D</given-names> </name><name name-style="western"><surname>Borden</surname><given-names>E</given-names> </name><name name-style="western"><surname>Needs</surname><given-names>P</given-names> </name></person-group><article-title>Association of outpatient behavioral health treatment with medical and pharmacy costs in the first 27 months following a new 
behavioral health diagnosis in the US</article-title><source>JAMA Netw Open</source><year>2022</year><month>12</month><day>1</day><volume>5</volume><issue>12</issue><fpage>e2244644</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2022.44644</pub-id><pub-id pub-id-type="medline">36472875</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Malpass</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dowrick</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gilbody</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Usefulness of PHQ-9 in primary care to determine meaningful symptoms of low mood: a qualitative study</article-title><source>Br J Gen Pract</source><year>2016</year><month>02</month><volume>66</volume><issue>643</issue><fpage>e78</fpage><lpage>e84</lpage><pub-id pub-id-type="doi">10.3399/bjgp16X683473</pub-id><pub-id pub-id-type="medline">26823268</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thombs</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Kwakkenbos</surname><given-names>L</given-names> </name><name name-style="western"><surname>Levis</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Benedetti</surname><given-names>A</given-names> </name></person-group><article-title>Addressing overestimation of the prevalence of depression based on self-report screening questionnaires</article-title><source>CMAJ</source><year>2018</year><month>01</month><day>15</day><volume>190</volume><issue>2</issue><fpage>E44</fpage><lpage>E49</lpage><pub-id pub-id-type="doi">10.1503/cmaj.170691</pub-id><pub-id 
pub-id-type="medline">29335262</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Jong</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Fox</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Steenkamp</surname><given-names>J</given-names> </name></person-group><article-title>Quantifying under- and overreporting in surveys through a dual-questioning-technique design</article-title><source>J Mark Res</source><year>2015</year><month>12</month><volume>52</volume><issue>6</issue><fpage>737</fpage><lpage>753</lpage><pub-id pub-id-type="doi">10.1509/jmr.12.0336</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hunt</surname><given-names>J</given-names> </name><name name-style="western"><surname>Eisenberg</surname><given-names>D</given-names> </name></person-group><article-title>Mental health problems and help-seeking behavior among college students</article-title><source>J Adolesc Health</source><year>2010</year><month>01</month><volume>46</volume><issue>1</issue><fpage>3</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1016/j.jadohealth.2009.08.008</pub-id><pub-id pub-id-type="medline">20123251</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Discrepancies between self-rated depression and observed depression severity: the effects of personality and dysfunctional 
attitudes</article-title><source>Gen Hosp Psychiatry</source><year>2021</year><volume>70</volume><fpage>25</fpage><lpage>30</lpage><pub-id pub-id-type="doi">10.1016/j.genhosppsych.2020.11.016</pub-id><pub-id pub-id-type="medline">33689981</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Harati</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rutowski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Obeid</surname><given-names>I</given-names> </name><name name-style="western"><surname>Picone</surname><given-names>J</given-names> </name><name name-style="western"><surname>Selesnick</surname><given-names>I</given-names> </name></person-group><article-title>Generalization of deep acoustic and NLP models for large-scale depression screening</article-title><source>Biomedical Sensing and Analysis</source><year>2022</year><publisher-name>Springer International Publishing</publisher-name><fpage>99</fpage><lpage>132</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-99383-2_318</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levis</surname><given-names>B</given-names> </name><name name-style="western"><surname>Benedetti</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thombs</surname><given-names>BD</given-names> </name></person-group><article-title>Accuracy of Patient Health Questionnaire-9 (PHQ-9) for screening to detect major depression: individual participant data meta-analysis</article-title><source>BMJ</source><year>2019</year><volume>365</volume><fpage>l1781</fpage><pub-id 
pub-id-type="doi">10.1136/bmj.l1781</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Metrics explanation and model receiver operating characteristic curve performance.</p><media xlink:href="ai_v4i1e69149_app1.docx" xlink:title="DOCX File, 1442 KB"/></supplementary-material></app-group></back></article>