<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e81868</article-id><article-id pub-id-type="doi">10.2196/81868</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Facial Expression&#x2013;Based Evaluation of the Emotion Estimation Software Kokoro Sensor in Healthy Individuals: Validation and Reliability Pilot Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Yoshihara</surname><given-names>Shota</given-names></name><degrees>MSc, OTR</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Amano</surname><given-names>Satoru</given-names></name><degrees>DMSc, OTR</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Takahashi</surname><given-names>Kayoko</given-names></name><degrees>ScD, OTR</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of 
Rehabilitation Sciences, Kitasato University Graduate School of Medical Sciences</institution><addr-line>Kanagawa</addr-line><country>Japan</country></aff><aff id="aff2"><institution>School of Allied Health Science, Kitasato University</institution><addr-line>1-15-1, Kitasato, Minami-ku, Sagamihara</addr-line><addr-line>Kanagawa</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mansoor</surname><given-names>Masab</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Larionow</surname><given-names>Pawel</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Satoru Amano, DMSc, OTR, School of Allied Health Science, Kitasato University, 1-15-1, Kitasato, Minami-ku, Sagamihara, Kanagawa, 252-0373, Japan, 81 042 778 9849; <email>s-amano@kitasato-u.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>26</day><month>2</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e81868</elocation-id><history><date date-type="received"><day>05</day><month>08</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>27</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Shota Yoshihara, Satoru Amano, Kayoko Takahashi. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 26.2.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e81868"/><abstract><sec><title>Background</title><p>In recent years, artificial intelligence (AI) systems have increasingly been used to assess emotional states in health care. AI offers a safe, quick, user-friendly, and objective emotional evaluation method. However, evidence supporting its implementation in health care remains limited.</p></sec><sec><title>Objective</title><p>This study aimed to explore the concurrent validity and test-retest reliability of emotion recognition AI based on facial expressions.</p></sec><sec sec-type="methods"><title>Methods</title><p>In this study, we used the Kokoro Sensor, an accurate and widely recognized automated facial expression recognition system. The Japanese version of the Profile of Mood States&#x2013;Short Form was used to screen the potential influence of mental states on facial expressions. The study participants made positive, negative, and neutral expressions, which were analyzed by the emotion recognition AI. 
Agreement between the results of the AI and subjective evaluations was assessed by participants and a researcher using a 4-point Likert-type scale. The facial expressions and emotion analysis process were repeated after a 30-minute interval to investigate reliability. Concurrent validity was evaluated using the content validity index (CVI) and &#x03BA; coefficient, and test-retest reliability was determined using the &#x03BA; coefficient.</p></sec><sec sec-type="results"><title>Results</title><p>The study participants were 40 individuals whose mental states did not deviate from the reference range of the Profile of Mood States manual. Among the participants, the CVI values for positive, neutral, and negative expressions were 95%, 98%, and 85%, respectively. For the researcher, the corresponding CVI values were 100%, 100%, and 70%, respectively. The overall weighted &#x03BA; coefficient was 0.55 (95% CI 0.44&#x2010;0.67), indicating moderate agreement. The agreement was almost perfect for distinguishing positive from neutral expressions (&#x03BA;=0.83, 95% CI 0.70&#x2010;0.95) but not statistically significant for distinguishing negative from neutral expressions (&#x03BA;=0.15, 95% CI &#x2013;0.07 to 0.37). Test-retest reliability analysis showed an overall weighted &#x03BA; coefficient of 0.66, reflecting substantial reliability. Almost perfect agreement was observed for distinguishing positive from neutral expressions (&#x03BA;=0.85, 95% CI 0.73&#x2010;0.97), while distinguishing negative from neutral expressions showed limited reliability (&#x03BA;=0.36, 95% CI 0.16&#x2010;0.57).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings suggest that the Kokoro Sensor may be useful for identifying positive affect, given its acceptable concurrent validity for overall valence estimation and its high agreement for distinguishing positive from neutral expressions. 
However, concurrent validity for negative expressions did not meet the prespecified benchmark based on the researcher&#x2019;s ratings, and agreement for distinguishing negative from neutral expressions was limited, which may constrain clinical utility for detecting negative affect. Therefore, in clinical settings, the Kokoro Sensor should be used as an assistive tool rather than a stand-alone method.</p></sec></abstract><kwd-group><kwd>emotion detection technology</kwd><kwd>facial expressions</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>emotion AI</kwd><kwd>rehabilitation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In recent years, artificial intelligence (AI) systems designed to predict human emotional states have garnered significant attention, especially in health care settings. These emotion recognition technologies have been the focus of intense research [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], using techniques such as facial recognition [<xref ref-type="bibr" rid="ref3">3</xref>], speech analysis [<xref ref-type="bibr" rid="ref4">4</xref>], text processing [<xref ref-type="bibr" rid="ref5">5</xref>], and electroencephalography-based brain activity monitoring [<xref ref-type="bibr" rid="ref6">6</xref>]. In health care settings, AI-driven real-time emotion recognition holds substantial promise, enabling providers to assess psychological states, such as pain and anxiety, and develop more personalized treatment plans swiftly. This technology addresses the limitations of traditional methods, which often depend on subjective patient self-reports, interviews, or clinician observations.</p><p>One of the most established methods for emotion recognition is the facial action coding system (FACS), a technique renowned for its precision in facial expression analysis [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. 
However, FACS is a human-driven method that requires not only substantial time for facial expression classification but also extensive training to acquire the necessary specialized skills [<xref ref-type="bibr" rid="ref12">12</xref>]. Consequently, its practical applicability in fast-paced clinical settings, where timely responses are essential, is limited. Accordingly, automated and efficient AI solutions that can be seamlessly integrated into clinical workflows are needed.</p><p>The Kokoro Sensor (CAC Inc) is a commercially available AI system for automated facial expression analysis that identifies 21 facial expressions and 7 basic emotions and outputs probability-based scores (0&#x2010;100) along with valence labels (positive, neutral, and negative) using algorithms grounded in Ekman basic emotion theory and FACS [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. According to publicly available documentation [<xref ref-type="bibr" rid="ref13">13</xref>], the underlying deep learning models were trained and tested on a corpus exceeding 14 million videos from 90 countries, providing substantial geographic diversity. Although detailed demographic composition is not disclosed, its size and international coverage are presumed to support model robustness and facilitate cross-cultural generalizability.</p><p>However, despite its use in a clinical setting [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>], the validity and reliability of the Kokoro Sensor for applications involving health care populations remain largely unexamined.</p><p>For an AI emotion tool to achieve clinical credibility, psychometric performance&#x2014;particularly validity and reliability&#x2014;should be established in line with the Consensus-based Standards for the Selection of Health Measurement Instruments (COSMIN) framework [<xref ref-type="bibr" rid="ref17">17</xref>]. 
Moreover, the evaluation should account for factors that can influence facial expressions across populations, such as cross-cultural variation in display rules (eg, between Western and Eastern populations) [<xref ref-type="bibr" rid="ref18">18</xref>], individual variability and population-level anatomical differences in facial musculature [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref23">23</xref>], and the greater suppression of facial movements reported in Eastern cohorts [<xref ref-type="bibr" rid="ref24">24</xref>]. Given these considerations, a focus on valence (positive or neutral or negative), rather than fine-grained discrete emotions, may offer a more robust and reproducible target across diverse groups.</p><p>To address this gap, this pilot study evaluated the Kokoro Sensor&#x2019;s (1) concurrent validity&#x2014;agreement between its valence outputs and human ratings&#x2014;and (2) test-retest reliability in healthy young Japanese adults. Establishing these properties provides evidence for the potential adjunctive use of this in clinical assessment.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Participants and Eligibility</title><p>Between February and June 2024, participants were recruited via posters; interested individuals contacted the first author (SY) either by email or in person. The first author coordinated enrollment, provided study information, and obtained written informed consent, and either SY or SA was present at all experimental sessions.</p><p>Eligible participants were Japanese adults aged 18&#x2010;30 years who were able to attend in-person laboratory sessions. 
The exclusion criteria were as follows: (1) a history of, or current, facial neuromuscular disorder; (2) a diagnosed psychiatric disorder; (3) insomnia; (4) self-reported current treatment for, or current symptoms consistent with, a sleep disorder, a stress-related condition, or fatigue, assessed via a brief self-report screening conducted verbally at enrollment; or (5) a Japanese Profile of Mood States&#x2013;Short Form (POMS-SF) Total Mood Disturbance (TMD) T-score of 70 or higher. These exclusion criteria were based on self-report and were not verified by clinical diagnosis or standardized screening thresholds.</p></sec><sec id="s2-2"><title>Mood Assessment</title><p>Current mood state was evaluated using the POMS-SF questionnaire [<xref ref-type="bibr" rid="ref25">25</xref>]. The POMS-SF indexes transient mood across 7 subscales: Anger-Hostility, Confusion-Bewilderment, Depression-Dejection, Fatigue-Inertia, Tension-Anxiety, Vigor-Activity, and Friendliness. TMD was calculated as the sum of negative subscales (Anger-Hostility, Confusion-Bewilderment, Depression-Dejection, Fatigue-Inertia, and Tension-Anxiety) minus the sum of positive subscales (Vigor-Activity, Friendliness), with higher scores indicating greater mood disturbance.</p><p>POMS-SF scores were standardized by sex and age in accordance with the guidelines outlined in the POMS-SF manual [<xref ref-type="bibr" rid="ref26">26</xref>]. We prespecified TMD 70 or higher as an exclusion threshold to avoid testing during periods of marked negative mood, which can blunt positive facial expressivity. 
By contrast, elevated positive-mood scores do not indicate affective distress and generally do not preclude the ability to produce instructed negative expressions; therefore, they were not used as exclusion criteria.</p></sec><sec id="s2-3"><title>Sample Size</title><p>In this study, the required sample size for calculating the weighted &#x03BA; coefficient, the primary analysis, was estimated based on a previous study [<xref ref-type="bibr" rid="ref27">27</xref>]. The estimation was conducted under the assumption of a 2-sided significance level of .05 and a statistical power of 0.8. Given a planned weighted &#x03BA; coefficient of 0.8 for a 3&#x00D7;3 contingency table, the sample size necessary to achieve the desired precision was calculated to be 39 participants. To mitigate the potential impact of participant attrition and unusable data, the final target sample size was adjusted to 40 participants.</p></sec><sec id="s2-4"><title>Study Setting</title><p>This cross-sectional study was conducted between March and June 2024 using a structured experimental design. Standardized equipment, including a high-resolution web camera (HD Webcam Meet, model number C960; EMEET), which was externally mounted on top of a personal computer, was used across all conditions. All experiments were carried out in a controlled, quiet laboratory environment.</p></sec><sec id="s2-5"><title>Study Flow and Procedure</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> shows the overall experimental flow. This study followed a structured experimental design, with a total time lasting approximately 50 minutes. 
This included 5 minutes for obtaining informed consent regarding the video recording for facial analysis, 5 minutes for instructions, 10 minutes for the experiment, and a 30-minute break.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall experimental flow.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e81868_fig01.png"/></fig><p>This 2-session, within-participant study was conducted in Japan and enrolled 40 healthy young individuals (aged 18&#x2010;30 y). After written informed consent and instructions (10 min), participants were video-recorded while mimicking neutral, positive, and negative facial expressions (0.5 min each), followed by a 5-min task in which both participants and the researcher judged how well the AI-based valence classification of each facial expression matched the emotion conveyed. After a 30-minute break, the 3 mimicry blocks were repeated in a second session. The total time per participant was approximately 50 minutes.</p><p>Participants first completed a brief warm-up in which they practiced producing each target facial expression (positive, neutral, and negative) for approximately 10 seconds per expression, under the researcher&#x2019;s guidance, to familiarize themselves with the procedure. The researcher then checked camera framing and instructed participants to adjust posture and face position to ensure consistent alignment for video recording. In the first phase, the participants were asked to display neutral (ie, normal), positive (ie, happy), and negative (eg, sad, frustrated, disgusted) facial expressions, each for 30 seconds, directed at the PC camera. 
The participants were not shown the results of the Kokoro Sensor analysis during this phase.</p><p>Additionally, both the participants and a researcher (SY) used a 4-point Likert-type scale with predefined anchors to evaluate the degree of agreement between the Kokoro Sensor&#x2019;s emotion detection and the participants&#x2019; self-reported emotional states. Importantly, the objective of this study was not to establish whether the Kokoro Sensor adheres to a FACS-based microexpression taxonomy but instead to determine whether it can detect broad emotional valence of the kind typically evaluated in clinical contexts. Consequently, the human rating task was restricted to positive, neutral, and negative valence, which can be judged reliably without specialized FACS training. The rater&#x2019;s role, therefore, centered on global valence matching rather than fine-grained action-unit coding, which we considered an appropriate level of expertise for the present aims.</p><p>To assess agreement, both participants and the researcher responded to specific prompts using a 4-point Likert scale (1=strongly disagree, 2=disagree, 3=agree, 4=strongly agree). Participants evaluated the prompt: &#x201C;How well does the Kokoro Sensor&#x2019;s result match the emotion you intended to express in this block?&#x201D; In contrast, the researcher evaluated the prompt: &#x201C;How well does the Kokoro Sensor&#x2019;s result match the target emotion instructed for this block?&#x201D; Notably, a neutral option was omitted to encourage decisive responses regarding the congruence between the sensor&#x2019;s detections and participants&#x2019; emotions. 
This process was then repeated after a 30-minute interval to investigate reliability.</p></sec><sec id="s2-6"><title>Statistical Analyses: Concurrent Validity Relative to Human Ratings</title><p>As described above in the study procedure, concurrent validity was assessed by comparing the Kokoro Sensor&#x2019;s AI-estimated valence classifications (neutral, positive, and negative) with human ratings. For each experimental block, we computed the proportions of frames classified as positive, neutral, and negative. Each block was labeled by the modal valence (ie, the category with the largest frame proportion). Block-level frame consistency was defined as the modal-valence proportion (ie, the maximum of the 3 frame proportions). Blocks with frame consistency of 75% or higher were classified as &#x201C;stable,&#x201D; and those with frame consistency of less than 75% as &#x201C;unstable.&#x201D; All blocks were retained; when frame consistency was less than 75%, the block was still labeled using the modal valence but interpreted as low consistency.</p><p>Concurrent validity was evaluated separately against participants&#x2019; ratings and the researcher&#x2019;s ratings as 2 independent reference standards. When participants and the researcher disagreed for the same block, no adjudication (eg, consensus, averaging, or exclusion) was performed; both ratings were retained and analyzed independently, and interrater agreement between the participant and researcher ratings was not calculated. Following the content validity index (CVI) approach [<xref ref-type="bibr" rid="ref28">28</xref>], perfect agreement (%) for each valence category was calculated as the proportion of ratings scored 3 or 4 on the 4-point Likert-type scale and was computed separately for participants and the researcher. 
In this study, a CVI value of 0.75 or higher was considered an acceptable level of concurrent validity.</p><p>Although the CVI is widely used, it does not account for inflated values resulting from chance agreement. To address this limitation, the weighted &#x03BA; statistic [<xref ref-type="bibr" rid="ref29">29</xref>] was calculated for agreement between the Kokoro Sensor&#x2019;s AI-estimated valence classifications and the target-posed valence condition to provide a more robust test of overall agreement. This analysis evaluated agreement against the intended experimental condition rather than the Likert-based human ratings. Notably, the CVI was calculated against human perceived valence ratings, whereas the weighted &#x03BA; was calculated against the intended posed-valence condition, therefore addressing complementary but non-identical reference standards. Additionally, &#x03BA; coefficients were calculated separately for distinguishing positive from neutral valence and for distinguishing negative from neutral valence.</p></sec><sec id="s2-7"><title>Test-Retest Reliability</title><p>We assessed the test-retest reliability of the Kokoro Sensor by comparing its valence classifications between session 1 and session 2, which were separated by a 30-minute interval. In both sessions, the same experimental procedures (3 emotional-expression mimicry blocks) were administered. For each 30-second block, framewise predictions were summarized as the proportions of frames classified as positive, neutral, and negative valence. Each block was labeled by the modal valence (ie, the category with the largest frame proportion). Block-level frame consistency was defined as the modal-valence proportion (maximum frame proportion). 
The same 75% threshold was used to stratify analyses by block-level consistency, classifying blocks as stable (&#x2265;75%) or unstable (&#x003C;75%), for sensitivity analyses.</p><p>For statistical analysis, we computed both weighted &#x03BA; and unweighted &#x03BA; coefficients with 95% CI. Test-retest reliability was quantified using the weighted &#x03BA; for overall agreement across the 3 valence categories (positive, neutral, and negative) and unweighted &#x03BA; for pairwise contrasts (distinguishing positive from neutral and distinguishing negative from neutral). All analyses were conducted in R (version 4.3.1, &#x201C;Beagle Scouts&#x201D;). &#x03BA; values were calculated using the kappa.stat function (Aoki, Gunma University), implemented via the <italic>vcd</italic> package together with supplemental functions sourced from the publicly available script repository [<xref ref-type="bibr" rid="ref30">30</xref>]. Agreement outcomes for each valence category were organized into cross-classification tables for &#x03BA; calculation.</p></sec><sec id="s2-8"><title>Interpretation of the &#x03BA; Coefficient</title><p>In this study, the interpretation of the weighted &#x03BA; coefficients was based on standard thresholds for domain-specific judgments: values of 0.20 or less were classified as indicating poor agreement, 0.21 to 0.40 as fair agreement, 0.41 to 0.60 as moderate agreement, 0.61 to 0.80 as substantial agreement, and 0.81 or greater as almost perfect agreement [<xref ref-type="bibr" rid="ref29">29</xref>].</p></sec><sec id="s2-9"><title>Sensitivity Analyses</title><p>The robustness of the findings was assessed in 3 ways. 
First, to examine whether agreement differed by within-block valence stability, analyses were stratified by block-level consistency, defined as the modal-valence proportion (ie, the maximum proportion of frames assigned to a single valence within a block), with blocks classified as stable blocks (consistency &#x2265;75%) or unstable blocks (consistency &#x003C;75%). Second, to examine sensitivity to the definition of &#x201C;high-consistency&#x201D; blocks, we repeated the &#x03BA;-based analyses after restricting the dataset to blocks whose modal-valence proportion met alternative frame consistency thresholds (&#x2265;60% and &#x2265;90%). Third, analyses were repeated after stratification by sex.</p><p>All analyses were performed using R (version 4.3.1; available at [<xref ref-type="bibr" rid="ref31">31</xref>]). The level of statistical significance was set at <italic>P</italic>&#x003C;.05 (2-tailed).</p></sec><sec id="s2-10"><title>Ethical Considerations</title><p>Written informed consent was obtained from each participant prior to their involvement in the study. This study adhered to the ethical principles outlined in the Declaration of Helsinki and was approved by the ethics review board of the School of Allied Health Sciences at Kitasato University (approval number 2023&#x2010;032). All collected data were anonymized before analysis to ensure participant confidentiality and privacy. All study participants were compensated with a QUO card valued at 1000 JPY (approximately US $7) as an honorarium.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>The characteristics of the participants (N=40; n=24, 60% male; median age: 21.0, IQR 21.0&#x2010;22.0 y) are shown in <xref ref-type="table" rid="table1">Table 1</xref>. In addition, POMS-SF descriptive statistics are summarized in <xref ref-type="table" rid="table2">Table 2</xref>. 
T-scores were generally centered on the normative mean, with few elevations of 70 or higher across subscales; positive dimensions were higher (Vigor-Activity mean: 55.0, SD 10.3; F mean: 58.7, SD 9.5). No participant met the exclusion threshold (TMD &#x2265;70); all 40 were included in the analyses.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of the participants (N=40).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Values</td></tr><tr><td align="left" valign="bottom">Age (y), median (IQR)</td><td align="left" valign="bottom">21.0 (21.0&#x2010;22.0)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Sex, n (%)</td></tr><tr><td align="left" valign="top">&#x2003;Male</td><td align="left" valign="top">24 (60)</td></tr><tr><td align="left" valign="top">&#x2003;Female</td><td align="left" valign="top">16 (40)</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Profile of Mood States&#x2013;Short Form (POMS-SF) score descriptors (N=40).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Scale</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Median (IQR)</td><td align="left" valign="bottom">Min-max T-score</td><td align="left" valign="bottom">T-score &#x2265;70, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">TMD<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> score</td><td align="left" valign="top">46.1 (9.6)</td><td align="left" valign="top">42.5 (39.5-50)</td><td align="left" valign="top">31-66</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">AH<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> score</td><td align="left" valign="top">43.5 (7.2)</td><td align="left" 
valign="top">41.0 (38.0-46.0)</td><td align="left" valign="top">36-64</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">CB<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> score</td><td align="left" valign="top">50.1 (11.0)</td><td align="left" valign="top">46.5 (41.0-59.0)</td><td align="left" valign="top">36-76</td><td align="left" valign="top">2 (5)</td></tr><tr><td align="left" valign="top">DD<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> score</td><td align="left" valign="top">48.5 (8.7)</td><td align="left" valign="top">45.0 (42.0-54.0)</td><td align="left" valign="top">39-68</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">FI<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> score</td><td align="left" valign="top">46.5 (9.6)</td><td align="left" valign="top">44.0 (41.0-51)</td><td align="left" valign="top">33-73</td><td align="left" valign="top">1 (2.5)</td></tr><tr><td align="left" valign="top">TA<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup> score</td><td align="left" valign="top">49.0 (9.9)</td><td align="left" valign="top">47.0 (42.0-57)</td><td align="left" valign="top">35-71</td><td align="left" valign="top">1 (2.5)</td></tr><tr><td align="left" valign="top">VA<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup> score</td><td align="left" valign="top">55.0 (10.3)</td><td align="left" valign="top">55.0 (46.5-62.0)</td><td align="left" valign="top">36-74</td><td align="left" valign="top">3 (7.5)</td></tr><tr><td align="left" valign="top">F<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup> score<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup></td><td align="left" valign="top">58.7 (9.5)</td><td align="left" valign="top">59.5 (53.0-66.0)</td><td align="left" valign="top">38-78</td><td align="left" valign="top">4 (10)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>TMD: Total 
Mood Disturbance.</p></fn><fn id="table2fn2"><p><sup>b</sup>AH: Anger-Hostility.</p></fn><fn id="table2fn3"><p><sup>c</sup>CB: Confusion-Bewilderment.</p></fn><fn id="table2fn4"><p><sup>d</sup>DD: Depression-Dejection.</p></fn><fn id="table2fn5"><p><sup>e</sup>FI: Fatigue-Inertia.</p></fn><fn id="table2fn6"><p><sup>f</sup>TA: Tension-Anxiety.</p></fn><fn id="table2fn7"><p><sup>g</sup>VA: Vigor-Activity.</p></fn><fn id="table2fn8"><p><sup>h</sup>F: Friendliness.</p></fn><fn id="table2fn9"><p><sup>i</sup>Friendliness is not included in TMD.</p></fn></table-wrap-foot></table-wrap><p>The success rate for meeting the frame consistency criterion, defined as 75% or higher of frames classified under a single valence within a 30-second block, was 86.7% (104/120). Accordingly, 13.3% (16/120) of the blocks were classified as low consistency at the 75% threshold. In sensitivity analyses, the success rates were 93.3% (112/120) at the 60% threshold and 81.7% (98/120) at the 90% threshold.</p><p><xref ref-type="table" rid="table3">Table 3</xref> shows the CVI values, while <xref ref-type="table" rid="table4">Table 4</xref> presents the weighted &#x03BA; and &#x03BA; coefficients for the concurrent validity assessments. As the &#x201C;percentage of perfect agreement (%)&#x201D; corresponds to the proportion of ratings scored 3 or greater on the Likert scale, these values are reported as CVI in <xref ref-type="table" rid="table3">Table 3</xref>. Among the participants, the CVI values for positive, neutral, and negative expressions were 95%, 98%, and 85%, respectively; for the researcher, the corresponding CVI values were 100%, 100%, and 70%. The overall weighted &#x03BA; coefficient was 0.55 (95% CI 0.44&#x2010;0.67), indicating moderate agreement. For distinguishing positive from neutral expressions, the &#x03BA; coefficient was 0.83 (95% CI 0.70&#x2010;0.95), indicating almost perfect agreement. 
For distinguishing negative from neutral expressions, the &#x03BA; coefficient was 0.15 (95% CI &#x2013;0.07 to 0.37), indicating no statistical significance.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Concurrent validity for positive, neutral, and negative expressions<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Expression</td><td align="left" valign="bottom">CVI<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> (%)&#x2014;participants</td><td align="left" valign="bottom">CVI (%)&#x2014;researcher</td></tr></thead><tbody><tr><td align="left" valign="top">Positive</td><td align="left" valign="top">95</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">Neutral</td><td align="left" valign="top">98</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">Negative</td><td align="left" valign="top">85</td><td align="left" valign="top">70</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>For each expression category, the content validity index was calculated as the proportion of ratings scored 3 (agree) or 4 (strongly agree) on a 4-point Likert-type scale, divided by the total number of ratings, and expressed as a percentage. 
For example, for positive expressions rated by participants, if 38 out of 40 ratings were 3 or 4, then content validity index was 95%.</p></fn><fn id="table3fn2"><p><sup>b</sup>CVI: content validity index.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Kappa coefficients for concurrent validity assessments across expression categories<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Concurrent validity</td><td align="left" valign="bottom">&#x03BA; coefficient</td><td align="left" valign="bottom" colspan="2">95% CI</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Lower</td><td align="left" valign="bottom">Upper</td></tr></thead><tbody><tr><td align="left" valign="top">Overall valence classification<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.44</td><td align="left" valign="top">0.67</td></tr><tr><td align="left" valign="top">Distinguishing positive from neutral</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top">Distinguishing negative from neutral</td><td align="left" valign="top">0.15</td><td align="left" valign="top">&#x2013;0.07</td><td align="left" valign="top">0.37</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>For each 30-second block, framewise classifications were summarized as the proportions of frames classified as positive, neutral, and negative. Blocks were retained regardless of frame consistency and were labeled using the modal valence (largest frame proportion). 
Frame consistency was defined as the modal-valence proportion (maximum frame proportion); the 75% threshold was used to classify blocks as stable (&#x2265;75%) or unstable (&#x003C;75%) in stratified sensitivity analyses (Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></fn><fn id="table4fn2"><p><sup>b</sup>Weighted &#x03BA; coefficient.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table5">Table 5</xref> presents the results of the test-retest reliability analysis. Test-retest agreement for distinguishing positive from neutral expressions was satisfactory, whereas agreement for distinguishing negative from neutral expressions was inadequate. The overall weighted &#x03BA; coefficient was 0.66 (95% CI 0.55&#x2010;0.76), with &#x03BA; coefficients of 0.85 (95% CI 0.73&#x2010;0.97) for distinguishing positive from neutral expressions and 0.36 (95% CI 0.16&#x2010;0.57) for distinguishing negative from neutral expressions.</p><p>Sensitivity analyses were conducted to assess robustness. First, analyses were stratified by block-level consistency (consistency &#x2265;75% or &#x003C;75%). In stable blocks (consistency &#x2265;75%), &#x03BA;-based concurrent validity estimates and test-retest reliability estimates were comparable to or slightly higher than those in the main analyses (Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). In unstable blocks (consistency &#x003C;75%), the overall concurrent validity remained relatively preserved (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), whereas &#x03BA;-based contrasts showed lower agreement and less precise estimates (Tables S2 and S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), likely due to sparse cell counts. 
Second, the results were materially unchanged when alternative thresholds of 60% and 90% were applied (Tables S4-S6 for concurrent validity and Tables S7-S9 for test-retest reliability in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Third, analyses stratified by sex showed broadly similar patterns, except for female participants in the &#x201C;distinguishing negative from neutral&#x201D; condition under alternative thresholds (Tables S7-S9 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Test-retest reliability analysis: &#x03BA; coefficients for positive, neutral, and negative expressions<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Test-retest reliability</td><td align="left" valign="bottom">&#x03BA; coefficient</td><td align="left" valign="bottom" colspan="2">95% CI</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Lower</td><td align="left" valign="bottom">Upper</td></tr></thead><tbody><tr><td align="left" valign="top">Overall valence classification<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">0.66</td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.76</td></tr><tr><td align="left" valign="top">Distinguishing positive from neutral</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.97</td></tr><tr><td align="left" valign="top">Distinguishing negative from neutral</td><td align="left" valign="top">0.36</td><td align="left" valign="top">0.16</td><td align="left" valign="top">0.57</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>For each 30-second block, framewise 
classifications were summarized as the proportions of frames classified as positive, neutral, and negative. Blocks were retained regardless of frame consistency and were labeled using the modal valence (largest frame proportion). Frame consistency was defined as the modal-valence proportion (maximum frame proportion); the 75% threshold was used to classify blocks as stable (&#x2265;75%) or unstable (&#x003C;75%) in stratified sensitivity analyses (Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The results of the analyses are shown as &#x03BA; coefficients and corresponding 95% CI.</p></fn><fn id="table5fn2"><p><sup>b</sup>Weighted &#x03BA; coefficient.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study assessed the concurrent validity and test-retest reliability of the Kokoro Sensor, an AI-based tool designed to detect emotional states based on facial expressions. The findings indicated that concurrent validity and reliability were satisfactory for overall valence classification and for distinguishing positive from neutral expressions. On the other hand, the concurrent validity for distinguishing negative from neutral expressions was not statistically significant, and limited reliability was observed. These findings represent a first step in exploring the potential clinical applications of this tool.</p></sec><sec id="s4-2"><title>Interpretation of the Findings for Distinguishing Positive From Neutral Findings</title><p>These findings showed satisfactory concurrent validity and test-retest reliability for distinguishing positive from neutral expressions. 
The CVI for distinguishing positive from neutral expressions exceeded both the concurrent validity benchmark set in our study (CVI &#x2265;0.75) and that set in previous studies (CVI &#x003E;0.78) [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>], supporting adequate concurrent validity. Additionally, the &#x03BA; coefficient for reliability was 0.85 (95% CI 0.73&#x2010;0.97), indicating almost perfect agreement, as a &#x03BA; coefficient of 0.81 or higher is generally considered indicative of this level of agreement [<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>This result suggests a high level of consistency in distinguishing between positive and neutral expressions. In support of these findings, previous studies have shown that distinguishing between positive and neutral expressions is generally clear and associated with minimal ambiguity, which facilitates consistent interpretation and AI processing [<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. For example, a previous study suggested that positive expressions such as smiles are generally more consistent because they involve clear changes in specific facial areas (eg, the mouth), which makes them easier for AI to recognize [<xref ref-type="bibr" rid="ref36">36</xref>]. Another previous study reported that AI systems generally trained on datasets often learn positive expressions more extensively because of their higher prevalence in daily life, resulting in improved processing accuracy for positive emotions [<xref ref-type="bibr" rid="ref37">37</xref>]. 
This phenomenon may also apply to the Kokoro Sensor dataset.</p></sec><sec id="s4-3"><title>Challenges in Distinguishing Negative From Neutral Expressions</title><p>By contrast, the CVI for negative expressions was 0.85 for participants and 0.70 for the researcher, indicating that the participants&#x2019; ratings met the concurrent validity benchmark set in our study (CVI &#x2265;0.75), whereas the researcher&#x2019;s ratings for negative expressions failed to meet this benchmark. In addition, the ability to distinguish between negative and neutral valence was not statistically significant, as the &#x03BA; for concurrent validity was 0.15 (95% CI &#x2013;0.07 to 0.37). Additionally, the reliability of these distinctions showed limited agreement, with a &#x03BA; of 0.36 (95% CI 0.16&#x2010;0.57).</p><p>Notably, our findings suggested that the inconsistency between the CVI of negative valence and the &#x03BA; coefficient for distinguishing negative from neutral may be attributable to differences in the comparator and agreement metrics. The CVI indicates the degree of agreement between the results of the Kokoro Sensor and the participants&#x2019; and researchers&#x2019; judgments of negative valence, while the &#x03BA; coefficient provides a chance-corrected index of agreement for distinguishing negative from neutral valence. Our findings suggest that although the Kokoro Sensor may estimate negative emotions from facial expressions, it might not adequately distinguish between negative and neutral expressions estimated from facial expressions.</p><p>There is 1 possible explanation for the difficulty in distinguishing between negative and neutral expressions. Negative facial expressions consist of smaller movement changes in facial expression muscle configurations [<xref ref-type="bibr" rid="ref38">38</xref>], which makes them less recognizable than positive expressions. 
Consequently, the boundary between negative and neutral expressions is frequently less distinct than that between positive and neutral expressions [<xref ref-type="bibr" rid="ref38">38</xref>], a challenge that may be further exacerbated by culturally shaped tendencies toward subdued or suppressed negative expressivity, particularly in East Asian populations. Furthermore, Affectiva [<xref ref-type="bibr" rid="ref13">13</xref>] reports that its models are trained and tested on a global dataset of over 14 million videos collected from 90 countries but does not disclose detailed demographic information (eg, ethnicity and the proportion of East Asian faces). If East Asian populations are underrepresented compared to the intended deployment environment, a distribution shift between training and use populations could contribute to the reduced performance in the Japanese cohort, particularly for subtle negative facial expressions. This possibility is consistent with broader concerns that facial analysis performance may vary across demographic subgroups.</p><p>In addition, the study protocol itself may have contributed to reducing within-block consistency in framewise valence classifications. Participants were required to hold a posed facial expression for 30 seconds, and sustaining a static configuration for this duration may induce facial muscle fatigue and gradual, natural relaxation toward a more neutral state. Such sustained-posing requests may result in time-dependent changes in expression intensity or muscle activation, which could increase frame-to-frame variability within a block and thereby elevate the proportion of blocks that failed to meet the prespecified benchmark set in our study.</p><p>The observed reliability should likewise be interpreted in light of this study protocol. Participants were instructed to reproduce the same target facial expression across sessions. 
However, because the protocol relied on posed expressions, session-to-session differences in how individuals enacted the target expressions (eg, intensity or configuration when reproducing &#x201C;sadness&#x201D;) may have reduced &#x03BA; even under stable sensor performance. The observed reliability likely reflects both sensor-related factors and within-participant inconsistency, which cannot be disentangled in this study design.</p></sec><sec id="s4-4"><title>Future Directions</title><p>These findings, along with previous consistent findings in both Western and Eastern contexts, emphasize the need for modifications based on new empirical evidence. Some previous studies in Western contexts have reported that emotional facial expressions generated based on scenarios (eg, &#x201C;show the facial expressions you would typically display when experiencing the emotions triggered by the following situations&#x201D;) are not always consistent with Ekman&#x2019;s theory of prototypical expressions [<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]. While the evidence is limited to Eastern contexts, 1 previous study [<xref ref-type="bibr" rid="ref42">42</xref>] using Ekman-based analyses with FaceReader reported that emotions such as &#x201C;happiness&#x201D; and &#x201C;surprise&#x201D; are recognizable, whereas the recognition of other emotional expressions (eg, anger, disgust, fear, sadness) remains more difficult. 
Given that previous reports have investigated cultural differences in facial expressions between Western and Eastern contexts [<xref ref-type="bibr" rid="ref18">18</xref>], and few studies have focused only on Eastern contexts, future research should investigate how cultural variations influence facial expression recognition and how these insights could be applied to improve AI systems.</p></sec><sec id="s4-5"><title>Clinical Applications</title><p>Our findings suggest that the Kokoro Sensor may have limited clinical utility for differentiating negative from neutral valence. The validity and reliability for estimating overall emotional valence and for distinguishing positive from neutral expressions were generally acceptable, indicating that the system may provide a useful indication of overall positive affect. Nevertheless, caution is warranted when using the Kokoro Sensor to differentiate negative from neutral valence, as its validity and reliability were weaker. Notably, the researcher&#x2019;s CVI for negative expressions (CVI=0.70) fell below the prespecified benchmark (CVI &#x2265;0.75). In particular, a &#x03BA; value of 0.36 for distinguishing negative from neutral expressions indicates limited reliability for this clinically relevant contrast, which further constrains stand-alone clinical use.</p><p>From a psychometric perspective, &#x03BA; values in the range of 0.40&#x2010;0.60 are typically interpreted as reflecting only moderate agreement, which is insufficient for stand-alone clinical decision-making. The &#x03BA; value of 0.55 observed for overall valence classification supports the use of the Kokoro Sensor as an adjunctive or screening aid rather than as an independent diagnostic instrument. Although there is no generally accepted consensus on &#x03BA; thresholds for AI-based facial-affect detection, many health care applications adopt &#x03BA; 0.80 or higher as a benchmark for diagnostic deployment. 
Accordingly, substantial gains in accuracy and temporal stability would be necessary before the Kokoro Sensor could be considered suitable for routine clinical decision-making.</p><p>The low agreement for distinguishing negative from neutral valence (&#x03BA;=0.15) further underscores an important limitation. When the sensor yields negative, ambiguous, or clinically incongruent outputs, clinicians should actively seek converging evidence from independent sources, such as structured observational scales, physiological indicators (eg, heart rate variability, actigraphy), or voice-based markers (eg, prosodic and other acoustic features).</p></sec><sec id="s4-6"><title>Limitations</title><p>This study has some limitations. First, the scenario-based induction of emotional facial expressions enabled a systematic examination across a wide range of emotions [<xref ref-type="bibr" rid="ref40">40</xref>]. For this pilot study, we used posed mimicry rather than mood induction to ensure a feasible and standardized protocol for initial device evaluation. This approach allowed for tighter control over target expressions and reduced procedural variability across participants and sessions. However, it may limit generalizability to naturalistic affective states. Posed expressions, even when scenario-based, do not fully capture the nuanced, blended, and transient facial displays that emerge spontaneously in real-world clinical settings. Such settings involve greater individual variability in imagery capacity and deliberate emotion masking (eg, of pain or anxiety) [<xref ref-type="bibr" rid="ref43">43</xref>], making posed expressions an imperfect surrogate and potentially overestimating performance. 
In addition, baseline POMS-SF scores indicated a relatively positive mood (Vigor-Activity T=55.0; Friendliness T=58.7), which may have made it more difficult for participants to authentically pose negative expressions (eg, reduced expressivity or emotion masking), potentially attenuating the negative-neutral contrast and thereby limiting the Kokoro Sensor&#x2019;s ability to differentiate negative from neutral expressions. Moreover, our blocked design&#x2014;using distinct 30-second segments per target emotion&#x2014;likely reduced ambiguity by constraining participants to a single labeled affective state at a time. While this improved experimental control, it may have limited the occurrence of transitional, mixed, or low-intensity expressions typical in natural contexts, thereby introducing spectrum bias and inflating validity metrics. Future studies should incorporate validated mood-induction paradigms (eg, standardized film clips or scripted scenarios) within clinical environments to enhance ecological validity [<xref ref-type="bibr" rid="ref44">44</xref>]. Second, the validity of the Kokoro Sensor in distinguishing negative from neutral expressions is uncertain, and any inferences about its detection of negative valence should be made cautiously. The &#x03BA; coefficient for this contrast was low and nonsignificant (&#x03BA;=0.15; 95% CI &#x2013;0.07 to 0.37), suggesting that the system failed to demonstrate reliable discrimination between negative and neutral valence. The wide confidence interval likely reflects a combination of the modest sample size and the intrinsic challenge of separating subtle negative expressions from neutral ones. Due to the use of an overall 3&#x00D7;3 weighted &#x03BA; test in the prior sample size determination, this study may have had limited power/precision for binary contrasts (eg, negative vs neutral). 
Consistent with the low &#x03BA; test values and wide confidence intervals observed for these contrasts, nonsignificant results should be interpreted with caution, as the possibility of a type 2 error cannot be excluded. Although a post hoc sex-stratified analysis suggested minimal sex differences overall, the apparent deviation observed among women in the &#x201C;negative from neutral&#x201D; condition should be interpreted cautiously because sex-specific hypotheses were not prespecified, and subgroup samples were small with imprecise estimates. Third, a subset of blocks did not meet the prespecified frame-consistency threshold (ie, &#x003C;75% of frames classified under a single valence). For these low-consistency blocks, block-level labels were assigned using the modal valence, which may be less reliable than labels derived from high-consistency segments. Therefore, aggregating low-consistency blocks with high-consistency blocks in the main analyses may obscure potentially important differences in Kokoro Sensor performance between more stable and less stable conditions, effectively averaging across heterogeneous performance conditions. Although stratified sensitivity analyses by block-level consistency were conducted (Tables S1-S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), estimates in the low-consistency subgroup should be interpreted cautiously due to sparse cell counts and reduced precision. Fourth, we assessed test-retest reliability over a 30-minute interval, which is brief and may allow memory or carryover effects. Longer intervals (eg, 24&#x2010;48 h) and multisession designs are needed to establish temporal stability more robustly. Fifth, human ratings relied on participants and a researcher using predefined anchors; this precluded estimation of interrater reliability. 
Additionally, because the 4-point Likert scale excluded a neutral midpoint by design, raters were compelled to choose a valence category when the AI valence classification was ambiguous. Such forced-choice responding may have biased responses toward the endorsement of a category and may have artificially increased observed agreement. Sixth, because the researcher provided instructions and was present during the sessions, the researcher&#x2019;s ratings were not blinded to the target emotion or condition in each block. This lack of blinding may have introduced observer bias into the researcher-rated dataset, potentially resulting in an overestimation of agreement estimates. Finally, our sample consisted of young Japanese adults, limiting generalizability to other age groups and to clinical populations where facial morphology (eg, wrinkles) or comorbid conditions (eg, facial palsy) may affect AI performance. Cross-cultural differences in expression production, such as those between Eastern and Western populations, may further influence recognition accuracy [<xref ref-type="bibr" rid="ref18">18</xref>]. Future studies should recruit broader samples that vary in age, culture, and clinical status.</p></sec><sec id="s4-7"><title>Conclusion</title><p>The findings of this study suggest that the Kokoro Sensor may be useful for identifying positive affect, given its acceptable concurrent validity for overall valence estimation and high agreement for distinguishing positive from neutral expressions. However, the prespecified benchmark for concurrent validity was not met for negative expressions based on the researcher&#x2019;s ratings, and agreement for distinguishing negative from neutral expressions was limited, which may constrain its clinical utility for detecting negative affect. 
Therefore, the Kokoro Sensor may be best used as an assistive tool rather than a stand-alone method in clinical settings.</p></sec></sec></body><back><ack><p>The authors would like to thank FORTE for English language editing. Grammar and style in the study were checked using the generative artificial intelligence tool ChatGPT [<xref ref-type="bibr" rid="ref45">45</xref>] and the software Grammarly [<xref ref-type="bibr" rid="ref46">46</xref>], solely to assist with language editing. No generative artificial intelligence tools were used to generate, analyze, or interpret the scientific content. The final content was reviewed and approved by all authors, who take full responsibility for its accuracy and integrity.</p></ack><notes><sec><title>Funding</title><p>This study was supported by a JSPS KAKENHI Grant-in-Aid for Early-Career Scientists (No. 23K16586).</p></sec><sec><title>Data Availability</title><p>The datasets generated and/or analyzed in this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>SY, SA, and KT designed the research. SY and SA obtained and analyzed the data. 
All authors wrote the manuscript and read and approved the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">COSMIN</term><def><p>Consensus-based Standards for the Selection of Health Measurement Instruments</p></def></def-item><def-item><term id="abb3">CVI</term><def><p>content validity index</p></def></def-item><def-item><term id="abb4">FACS</term><def><p>facial action coding system</p></def></def-item><def-item><term id="abb5">POMS-SF</term><def><p>Japanese Profile of Mood States&#x2013;Short Form</p></def></def-item><def-item><term id="abb6">TMD</term><def><p>Total Mood Disturbance</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tawsif</surname><given-names>K</given-names> </name><name name-style="western"><surname>Aziz</surname><given-names>NAA</given-names> </name><name name-style="western"><surname>Raja</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Hossen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jesmeen</surname><given-names>MZH</given-names> </name></person-group><article-title>A systematic review on emotion recognition system using physiological signals: data acquisition and methodology</article-title><source>Emerg Sci J</source><year>2022</year><volume>6</volume><issue>5</issue><fpage>1167</fpage><lpage>1198</lpage><pub-id pub-id-type="doi">10.28991/ESJ-2022-06-05-017</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x0160;umak</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Brdnik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pu&#x0161;nik</surname><given-names>M</given-names> </name></person-group><article-title>Sensors and artificial intelligence methods and algorithms for human-computer intelligent interaction: a systematic mapping study</article-title><source>Sensors (Basel)</source><year>2021</year><month>12</month><day>21</day><volume>22</volume><issue>1</issue><fpage>20</fpage><pub-id pub-id-type="doi">10.3390/s22010020</pub-id><pub-id pub-id-type="medline">35009562</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ko</surname><given-names>BC</given-names> </name></person-group><article-title>A brief review of facial emotion recognition based on visual information</article-title><source>Sensors (Basel)</source><year>2018</year><month>01</month><day>30</day><volume>18</volume><issue>2</issue><fpage>401</fpage><pub-id pub-id-type="doi">10.3390/s18020401</pub-id><pub-id pub-id-type="medline">29385749</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khalil</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>E</given-names> </name><name name-style="western"><surname>Babar</surname><given-names>MI</given-names> </name><name name-style="western"><surname>Jan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zafar</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Alhussain</surname><given-names>T</given-names> </name></person-group><article-title>Speech emotion recognition using deep learning techniques: a review</article-title><source>IEEE 
Access</source><year>2019</year><volume>7</volume><fpage>117327</fpage><lpage>117345</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2019.2936124</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alswaidan</surname><given-names>N</given-names> </name><name name-style="western"><surname>Menai</surname><given-names>MEB</given-names> </name></person-group><article-title>A survey of state-of-the-art approaches for emotion recognition in text</article-title><source>Knowl Inf Syst</source><year>2020</year><month>08</month><volume>62</volume><issue>8</issue><fpage>2937</fpage><lpage>2987</lpage><pub-id pub-id-type="doi">10.1007/s10115-020-01449-0</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Islam</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Moni</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Islam</surname><given-names>MM</given-names> </name><etal/></person-group><article-title>Emotion recognition from EEG signal focusing on deep learning and shallow learning techniques</article-title><source>IEEE Access</source><year>2021</year><volume>9</volume><fpage>94601</fpage><lpage>94624</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2021.3091487</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Crist</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Duncan</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Gallagher</surname><given-names>DL</given-names> </name></person-group><article-title>Protocol for data collection and analysis applied to automated facial expression 
analysis technology and temporal analysis for sensory evaluation</article-title><source>J Vis Exp</source><year>2016</year><month>08</month><day>26</day><issue>114</issue><fpage>54046</fpage><pub-id pub-id-type="doi">10.3791/54046</pub-id><pub-id pub-id-type="medline">27685862</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Crist</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Duncan</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Arnade</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Leitch</surname><given-names>KA</given-names> </name><name name-style="western"><surname>O&#x2019;Keefe</surname><given-names>SF</given-names> </name><name name-style="western"><surname>Gallagher</surname><given-names>DL</given-names> </name></person-group><article-title>Automated facial expression analysis for emotional responsivity using an aqueous bitter model</article-title><source>Food Qual Prefer</source><year>2018</year><month>09</month><volume>68</volume><fpage>349</fpage><lpage>359</lpage><pub-id pub-id-type="doi">10.1016/j.foodqual.2018.04.004</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ekman</surname><given-names>P</given-names> </name></person-group><source>Emotions Revealed: Recognizing Faces and Feelings to Improve Communication and Emotional Life</source><year>2003</year><access-date>2026-02-07</access-date><publisher-name>Times Books</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://psycnet.apa.org/record/2003-88051-000">https://psycnet.apa.org/record/2003-88051-000</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Tong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>Q</given-names> </name></person-group><article-title>Facial action unit recognition by exploiting their dynamic and semantic relationships</article-title><source>IEEE Trans Pattern Anal Mach Intell</source><year>2007</year><month>10</month><volume>29</volume><issue>10</issue><fpage>1683</fpage><lpage>1699</lpage><pub-id pub-id-type="doi">10.1109/TPAMI.2007.1094</pub-id><pub-id pub-id-type="medline">17699916</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Rosenberg</surname><given-names>EL</given-names> </name><name name-style="western"><surname>Ekman</surname><given-names>P</given-names> </name></person-group><article-title>Coherence between expressive and experiential systems in emotion</article-title><source>What the Face Reveals: Basic and Applied Studies of Spontaneous Expression Using the Facial Action Coding System (FACS)</source><year>1997</year><publisher-name>Oxford University Press</publisher-name><fpage>63</fpage><lpage>88</lpage><pub-id pub-id-type="doi">10.1093/oso/9780195104462.003.0004</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bartlett</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Hager</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Ekman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Sejnowski</surname><given-names>TJ</given-names> </name></person-group><article-title>Measuring facial expressions by computer image 
analysis</article-title><source>Psychophysiology</source><year>1999</year><month>03</month><volume>36</volume><issue>2</issue><fpage>253</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.1017/s0048577299971664</pub-id><pub-id pub-id-type="medline">10194972</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><article-title>Unlocking human insights: Affectiva&#x2019;s journey in data and emotion AI</article-title><source>Affectiva</source><access-date>2025-10-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://go.smarteye.se/unlocking-human-insights-affectivas-journey-in-data-and-emotion-ai">https://go.smarteye.se/unlocking-human-insights-affectivas-journey-in-data-and-emotion-ai</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wakihira</surname><given-names>T</given-names> </name><name name-style="western"><surname>Morimoto</surname><given-names>M</given-names> </name><name name-style="western"><surname>Higuchi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nagatomi</surname><given-names>Y</given-names> </name></person-group><article-title>Can facial expressions predict beer choices after tasting? 
A proof of concept study on implicit measurements for a better understanding of choice behavior among beer consumers</article-title><source>Food Qual Prefer</source><year>2022</year><month>09</month><volume>100</volume><fpage>104580</fpage><pub-id pub-id-type="doi">10.1016/j.foodqual.2022.104580</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ikeda</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kobayakawa</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nakao</surname><given-names>H</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Matsushita</surname><given-names>H</given-names> </name></person-group><article-title>Information technology/artificial intelligence innovations needed for better quality of life in caregiving homes</article-title><source>Health Informatics: Translating Information into Innovation</source><year>2021</year><publisher-name>Springer</publisher-name><fpage>37</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.1007/978-981-15-3781-3_3</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Case studies | Affectiva x CAC [Website in Japanese]</article-title><source>Affectiva</source><access-date>2024-07-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.affectiva.jp/affectiva-topics">https://www.affectiva.jp/affectiva-topics</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mokkink</surname><given-names>LB</given-names> </name><name name-style="western"><surname>Prinsen</surname><given-names>CAC</given-names> </name><name 
name-style="western"><surname>Bouter</surname><given-names>LM</given-names> </name><name name-style="western"><surname>De Vet</surname><given-names>HCW</given-names> </name><name name-style="western"><surname>Terwee</surname><given-names>CB</given-names> </name></person-group><article-title>The COnsensus-based standards for the selection of health Measurement INstruments (COSMIN) and how to select an outcome measurement instrument</article-title><source>Braz J Phys Ther</source><year>2016</year><month>01</month><day>19</day><volume>20</volume><issue>2</issue><fpage>105</fpage><lpage>113</lpage><pub-id pub-id-type="doi">10.1590/bjpt-rbf.2014.0143</pub-id><pub-id pub-id-type="medline">26786084</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Eibl-Eibesfeldt</surname><given-names>I</given-names> </name></person-group><source>Human Ethology</source><year>2017</year><publisher-name>Routledge</publisher-name><pub-id pub-id-type="doi">10.4324/9780203789544</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>D&#x2019;Andrea</surname><given-names>E</given-names> </name><name name-style="western"><surname>Barbaix</surname><given-names>E</given-names> </name></person-group><article-title>Anatomic research on the perioral muscles, functional matrix of the maxillary and mandibular bones</article-title><source>Surg Radiol Anat</source><year>2006</year><month>06</month><volume>28</volume><issue>3</issue><fpage>261</fpage><lpage>266</lpage><pub-id pub-id-type="doi">10.1007/s00276-006-0095-y</pub-id><pub-id pub-id-type="medline">16547604</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Waller</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Cray</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Burrows</surname><given-names>AM</given-names> </name></person-group><article-title>Selection for universal facial emotion</article-title><source>Emotion</source><year>2008</year><month>06</month><volume>8</volume><issue>3</issue><fpage>435</fpage><lpage>439</lpage><pub-id pub-id-type="doi">10.1037/1528-3542.8.3.435</pub-id><pub-id pub-id-type="medline">18540761</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeong</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lemke</surname><given-names>BN</given-names> </name><name name-style="western"><surname>Dortzbach</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Park</surname><given-names>YG</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>HK</given-names> </name></person-group><article-title>The Asian upper eyelid: an anatomical study with comparison to the Caucasian eyelid</article-title><source>Arch Ophthalmol</source><year>1999</year><month>07</month><volume>117</volume><issue>7</issue><fpage>907</fpage><lpage>912</lpage><pub-id pub-id-type="doi">10.1001/archopht.117.7.907</pub-id><pub-id pub-id-type="medline">10408455</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shimada</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gasser</surname><given-names>RF</given-names> </name></person-group><article-title>Variations in the facial muscles at the angle of the mouth</article-title><source>Clin 
Anat</source><year>1989</year><month>01</month><volume>2</volume><issue>3</issue><fpage>129</fpage><lpage>134</lpage><pub-id pub-id-type="doi">10.1002/ca.980020302</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Gil</surname><given-names>YC</given-names> </name><etal/></person-group><article-title>Anatomical considerations regarding the location and boundary of the depressor anguli oris muscle with reference to botulinum toxin injection</article-title><source>Plast Reconstr Surg</source><year>2014</year><month>11</month><volume>134</volume><issue>5</issue><fpage>917</fpage><lpage>921</lpage><pub-id pub-id-type="doi">10.1097/PRS.0000000000000589</pub-id><pub-id pub-id-type="medline">25347627</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tzou</surname><given-names>CHJ</given-names> </name><name name-style="western"><surname>Giovanoli</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ploner</surname><given-names>M</given-names> </name><name name-style="western"><surname>Frey</surname><given-names>M</given-names> </name></person-group><article-title>Are there ethnic differences of facial movements between Europeans and Asians?</article-title><source>Br J Plast Surg</source><year>2005</year><month>03</month><volume>58</volume><issue>2</issue><fpage>183</fpage><lpage>195</lpage><pub-id pub-id-type="doi">10.1016/j.bjps.2004.10.014</pub-id><pub-id pub-id-type="medline">15710113</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Konuma</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hirose</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yokoyama</surname><given-names>K</given-names> </name></person-group><article-title>Relationship of the Japanese translation of the profile of mood states second edition (POMS 2) to the first edition (POMS) [Article in Japanese]</article-title><source>Juntendo Med J</source><year>2015</year><volume>61</volume><issue>5</issue><fpage>517</fpage><lpage>519</lpage><pub-id pub-id-type="doi">10.14789/jmj.61.517</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>McNair</surname><given-names>DM</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Heuchert</surname><given-names>JP</given-names> </name></person-group><source>POMS 2 Japanese Manual [Book in Japanese]</source><year>2015</year><access-date>2026-02-07</access-date><publisher-name>Kaneko Shobo</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="http://ci.nii.ac.jp/ncid/BB19364342">http://ci.nii.ac.jp/ncid/BB19364342</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bujang</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Baharum</surname><given-names>N</given-names> </name></person-group><article-title>Guidelines of the minimum sample size requirements for Kappa agreement test</article-title><source>Epidemiol Biostat Public Health</source><year>2022</year><volume>14</volume><issue>2</issue><pub-id pub-id-type="doi">10.2427/12267</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>Weighted kappa: nominal scale agreement with provision for scaled disagreement or partial credit</article-title><source>Psychol Bull</source><year>1968</year><month>10</month><volume>70</volume><issue>4</issue><fpage>213</fpage><lpage>220</lpage><pub-id pub-id-type="doi">10.1037/h0026256</pub-id><pub-id pub-id-type="medline">19673146</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id><pub-id pub-id-type="medline">843571</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>Kappa_stat.R</article-title><source>Aoki&#x2019;s Statistical Analysis (Gunma University)</source><access-date>2025-12-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://aoki2.si.gunma-u.ac.jp/R/src/kappa_stat.R">http://aoki2.si.gunma-u.ac.jp/R/src/kappa_stat.R</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>The R project for statistical computing</article-title><source>R Project</source><access-date>2026-02-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.r-project.org/">https://www.r-project.org/</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Shrotryia</surname><given-names>VK</given-names> </name><name name-style="western"><surname>Dhanda</surname><given-names>U</given-names> </name></person-group><article-title>Content validity of assessment instrument for employee engagement</article-title><source>SAGE Open</source><year>2019</year><month>01</month><volume>9</volume><issue>1</issue><pub-id pub-id-type="doi">10.1177/2158244018821751</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lynn</surname><given-names>MR</given-names> </name></person-group><article-title>Determination and quantification of content validity</article-title><source>Nurs Res</source><year>1986</year><volume>35</volume><issue>6</issue><fpage>382</fpage><lpage>385</lpage><pub-id pub-id-type="medline">3640358</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mattek</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Whalen</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Berkowitz</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Freeman</surname><given-names>JB</given-names> </name></person-group><article-title>Differential effects of cognitive load on subjective versus motor responses to ambiguously valenced facial expressions</article-title><source>Emotion</source><year>2016</year><month>09</month><volume>16</volume><issue>6</issue><fpage>929</fpage><lpage>936</lpage><pub-id pub-id-type="doi">10.1037/emo0000148</pub-id><pub-id pub-id-type="medline">27148846</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Menne-Lothmann</surname><given-names>C</given-names> </name><name name-style="western"><surname>Viechtbauer</surname><given-names>W</given-names> </name><name name-style="western"><surname>H&#x00F6;hn</surname><given-names>P</given-names> </name><etal/></person-group><article-title>How to boost positive interpretations? A meta-analysis of the effectiveness of cognitive bias modification for interpretation</article-title><source>PLoS One</source><year>2014</year><volume>9</volume><issue>6</issue><fpage>e100925</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0100925</pub-id><pub-id pub-id-type="medline">24968234</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burgess</surname><given-names>R</given-names> </name><name name-style="western"><surname>Culpin</surname><given-names>I</given-names> </name><name name-style="western"><surname>Costantini</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bould</surname><given-names>H</given-names> </name><name name-style="western"><surname>Nabney</surname><given-names>I</given-names> </name><name name-style="western"><surname>Pearson</surname><given-names>RM</given-names> </name></person-group><article-title>Quantifying the efficacy of an automated facial coding software using videos of parents</article-title><source>Front Psychol</source><year>2023</year><volume>14</volume><fpage>1223806</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2023.1223806</pub-id><pub-id pub-id-type="medline">37583610</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Powers</surname><given-names>DMW</given-names> 
</name></person-group><article-title>Face and facial expression recognition&#x2014;fusion based non negative matrix factorization</article-title><conf-name>Proceedings of the International Conference on Agents and Artificial Intelligence</conf-name><conf-date>Jan 10-12, 2015</conf-date><pub-id pub-id-type="doi">10.5220/0005216004260434</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnston</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Katsikitis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Carr</surname><given-names>VJ</given-names> </name></person-group><article-title>A generalised deficit can account for problems in facial emotion recognition in schizophrenia</article-title><source>Biol Psychol</source><year>2001</year><month>12</month><volume>58</volume><issue>3</issue><fpage>203</fpage><lpage>227</lpage><pub-id pub-id-type="doi">10.1016/s0301-0511(01)00114-4</pub-id><pub-id pub-id-type="medline">11698115</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gosselin</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kirouac</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dor&#x00E9;</surname><given-names>FY</given-names> </name></person-group><article-title>Components and recognition of facial expression in the communication of emotion by actors</article-title><source>J Pers Soc Psychol</source><year>1995</year><month>01</month><volume>68</volume><issue>1</issue><fpage>83</fpage><lpage>96</lpage><pub-id pub-id-type="doi">10.1037//0022-3514.68.1.83</pub-id><pub-id pub-id-type="medline">7861316</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Galati</surname><given-names>D</given-names> </name><name name-style="western"><surname>Scherer</surname><given-names>KR</given-names> </name><name name-style="western"><surname>Ricci-Bitti</surname><given-names>PE</given-names> </name></person-group><article-title>Voluntary facial expression of emotion: comparing congenitally blind with normally sighted encoders</article-title><source>J Pers Soc Psychol</source><year>1997</year><volume>73</volume><issue>6</issue><fpage>1363</fpage><lpage>1379</lpage><pub-id pub-id-type="doi">10.1037/0022-3514.73.6.1363</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scherer</surname><given-names>KR</given-names> </name><name name-style="western"><surname>Ellgring</surname><given-names>H</given-names> </name></person-group><article-title>Are facial expressions of emotion produced by categorical affect programs or dynamically driven by appraisal?</article-title><source>Emotion</source><year>2007</year><month>02</month><volume>7</volume><issue>1</issue><fpage>113</fpage><lpage>130</lpage><pub-id pub-id-type="doi">10.1037/1528-3542.7.1.113</pub-id><pub-id pub-id-type="medline">17352568</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sato</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hyniewska</surname><given-names>S</given-names> </name><name name-style="western"><surname>Minemoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yoshikawa</surname><given-names>S</given-names> </name></person-group><article-title>Facial expressions of basic emotions in Japanese laypeople</article-title><source>Front 
Psychol</source><year>2019</year><volume>10</volume><fpage>259</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2019.00259</pub-id><pub-id pub-id-type="medline">30809180</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Barrett</surname><given-names>LF</given-names> </name></person-group><article-title>How does this make you feel? A comparison of four affect induction procedures</article-title><source>Front Psychol</source><year>2014</year><volume>5</volume><fpage>689</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2014.00689</pub-id><pub-id pub-id-type="medline">25071659</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gross</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Levenson</surname><given-names>RW</given-names> </name></person-group><article-title>Emotion elicitation using films</article-title><source>Cogn Emot</source><year>1995</year><month>01</month><volume>9</volume><issue>1</issue><fpage>87</fpage><lpage>108</lpage><pub-id pub-id-type="doi">10.1080/02699939508408966</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="web"><article-title>Introducing GPT-5.2</article-title><source>OpenAI</source><access-date>2026-01-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/introducing-gpt-5-2/">https://openai.com/index/introducing-gpt-5-2/</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="web"><source>Grammarly</source><access-date>2025-03-08</access-date><comment><ext-link 
ext-link-type="uri" xlink:href="https://app.grammarly.com/">https://app.grammarly.com/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Sensitivity analyses.</p><media xlink:href="ai_v5i1e81868_app1.docx" xlink:title="DOCX File, 30 KB"/></supplementary-material></app-group></back></article>