<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v5i1e87728</article-id>
      <article-id pub-id-type="pmid"/>
      <article-id pub-id-type="doi">10.2196/87728</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Participant-Aware Model Validation for Repeated-Measures Data: Comparative Cross-Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Luo</surname>
            <given-names>Gang</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Herdiman</surname>
            <given-names>Lobes</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Palama</surname>
            <given-names>Valentina</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hu</surname>
            <given-names>Yihan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Karbalaie</surname>
            <given-names>Abdolamir</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7320-2306</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Abtahi</surname>
            <given-names>Farhad</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <address>
            <institution>Department of Clinical Science, Intervention and Technology</institution>
            <institution>Karolinska Institutet</institution>
            <addr-line>Alfred Nobels Allé 8</addr-line>
            <addr-line>Huddinge, Stockholm, 14152</addr-line>
            <country>Sweden</country>
            <phone>46 8 524 838 01</phone>
            <email>farhad.abtahi@ki.se</email>
          </address>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7807-8682</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Häger</surname>
            <given-names>Charlotte K</given-names>
          </name>
          <degrees>Prof Dr Med</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0366-4609</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Community Medicine and Rehabilitation</institution>
        <institution>Umeå University</institution>
        <addr-line>Umeå, Västerbotten</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Biomedical Engineering and Health Systems, School of Engineering Sciences in Chemistry, Biotechnology and Health</institution>
        <institution>KTH Royal Institute of Technology</institution>
        <addr-line>Huddinge, Stockholm</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Clinical Science, Intervention and Technology</institution>
        <institution>Karolinska Institutet</institution>
        <addr-line>Huddinge, Stockholm</addr-line>
        <country>Sweden</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Clinical Physiology</institution>
        <institution>Karolinska University Hospital</institution>
        <addr-line>Stockholm</addr-line>
        <country>Sweden</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Farhad Abtahi <email>farhad.abtahi@ki.se</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>30</day>
        <month>4</month>
        <year>2026</year>
      </pub-date>
      <volume>5</volume>
      <elocation-id>e87728</elocation-id>
      <history>
        <date date-type="received">
          <day>13</day>
          <month>11</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>31</day>
          <month>3</month>
          <year>2026</year>
        </date>
        <date date-type="rev-recd">
          <day>8</day>
          <month>4</month>
          <year>2026</year>
        </date>
        <date date-type="accepted">
          <day>8</day>
          <month>4</month>
          <year>2026</year>
        </date>
      </history>
      <copyright-statement>©Abdolamir Karbalaie, Farhad Abtahi, Charlotte K Häger. Originally published in JMIR AI (https://ai.jmir.org), 30.04.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2026/1/e87728" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Repeated-measures datasets are common in biomechanics and digital health, where each participant contributes multiple correlated trials. If cross-validation (CV) ignores this structure, information can leak from training to test folds, inflating performance and undermining clinical credibility.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study evaluates the impact of participant-aware validation strategies on model reliability in repeated-measures classification tasks, using fear of reinjury prediction following anterior cruciate ligament reconstruction (ACLR) as a case study.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We analyzed 623 hop trials from 72 individuals after ACLR to classify fear of reinjury based on biomechanical features. Four CV strategies were compared: stratified 10-fold CV, leave-one-participant-out cross-validation (LOPOCV), group 3-fold CV, and a nested framework combining LOPOCV (outer loop) with group 3-fold CV (inner loop). Ten supervised classifiers were benchmarked across classification accuracy, train-test generalization gap, model ranking consistency, and computational efficiency.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Stratified 10-fold CV systematically overestimated model performance (eg, extra trees accuracy of 0.91 vs 0.66 under LOPOCV) due to participant-level data leakage. Group and nested CV strategies yielded more conservative and stable estimates. The nested LOPOCV + group CV framework achieved a good balance between generalization and participant-aware separation, with reduced bias and overfitting compared with nonnested alternatives.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Participant-aware validation strategies are essential for trustworthy machine learning (ML) evaluation in repeated-measures settings. Nested CV designs improve reproducibility, reduce selection bias, and align with regulatory expectations for clinical ML tools. These findings support best practices in model validation for biomechanics and digital health applications.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>data leakage prevention</kwd>
        <kwd>machine learning validation</kwd>
        <kwd>human movement control</kwd>
        <kwd>model selection bias</kwd>
        <kwd>cross-validation benchmarking</kwd>
        <kwd>transparent AI evaluation</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Machine learning (ML) is becoming increasingly central to analyzing complex human data gathered through repeated trials, particularly in biomechanics, clinical assessment, and digital health contexts. These datasets, often comprising multiple sessions or sensor recordings per participant, provide insight into within-person variability and strengthen longitudinal evaluations. In biomechanics, repeated gait and hop assessments quantify within-person variability and recovery [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Similar repeated designs track performance shifts in sports contexts [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>In clinical and behavioral research, accounting for repeated measures is crucial for making valid inferences. Powell et al [<xref ref-type="bibr" rid="ref4">4</xref>] applied repeated-measures analysis of covariance to assess how breast support influences running biomechanics while controlling for within-participant variation. Similarly, Keogh et al [<xref ref-type="bibr" rid="ref5">5</xref>] used repeated-measures correlation to separate individual-level patterns from group-level effects in a cohort of athletes. These examples highlight the necessity of modeling within-participant structures to draw meaningful conclusions.</p>
      <p>With the rise of wearable technologies and sensor-based monitoring, repeated-measures data have become increasingly prevalent in applications ranging from cardiac diagnostics to musculoskeletal load tracking [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. When ML evaluation ignores participant identity, trial-level dependencies can leak across folds, inflating performance and undermining generalizability [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. Despite broad awareness of this risk, stratified K-fold CV is still commonly used in ways that overlook subject boundaries. This creates data leakage, as correlated trials from the same participant can appear in both training and test sets. Studies in neuroimaging, ophthalmology, and sensor-based health applications have reported this pattern [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Our companion standardized rebound side hops (SRSH) study addressed the same clinical construct using the entire biomechanical time series with a 1D convolutional neural network under leave-one-participant-out cross-validation (LOPOCV) [<xref ref-type="bibr" rid="ref14">14</xref>]. That analysis demonstrated that participant-wise evaluation is crucial for credible accuracy on repeated hop trials. That study held model hyperparameters fixed across folds; here, we systematize the validation question by comparing participant-aware CV designs and, when tuning is required, separating it from evaluation.</p>
      <p>Participant-aware validation strategies such as LOPOCV and group K-fold CV have been proposed to enforce participant-level separation during evaluation [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. However, these strategies differ in how they handle intraparticipant variation during hyperparameter tuning, and simple (nonnested) implementations can still introduce selection bias. Nested CV frameworks, which decouple tuning from evaluation, offer a principled alternative to reduce selection bias and overfitting, but adoption remains limited in clinical and behavioral ML [<xref ref-type="bibr" rid="ref8">8</xref>]. Crucially, the field lacks a systematic benchmark that goes beyond accuracy to quantify (1) validation bias, (2) train-test gaps, (3) participant-level rank stability, and (4) computational efficiency across commonly used models—limitations that constrain reproducibility and practical deployment.</p>
      <p>This study addresses this need by providing a systematic, multidimensional benchmark of cross-validation (CV) strategies for repeated-measures clinical ML data. We compare 4 CV designs, including a fully nested framework with LOPOCV for outer-loop evaluation and group K-fold (K=3) for inner-loop model selection, across 10 classifiers and the 4 evaluation axes listed above. The anterior cruciate ligament reconstruction (ACLR) fear-of-reinjury classification task serves as a representative case study; the primary contribution is the validation methodology, which generalizes to other repeated-measures clinical settings. The study follows the TRIPOD+AI (Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis—Artificial Intelligence extension) guidelines [<xref ref-type="bibr" rid="ref17">17</xref>], and the completed checklist is provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The following research questions were posed to guide the investigation:</p>
      <list list-type="bullet">
        <list-item>
          <p>Does standalone LOPOCV overestimate performance by neglecting within-participant variation during tuning?</p>
        </list-item>
        <list-item>
          <p>Can group K-fold (K=3) CV improve model selection by leveraging repeated trials?</p>
        </list-item>
        <list-item>
          <p>Does a nested LOPOCV (outer) + group K-fold CV (inner) strategy yield more stable and realistic performance estimates for repeated-measures classification?</p>
        </list-item>
      </list>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Cohort</title>
        <p>This study included 72 individuals (43 males and 29 females; age: mean 28.4 years, SD 6.7 years), all between 6 and 24 months after unilateral ACLR using a hamstring autograft.</p>
      </sec>
      <sec>
        <title>Ethics Considerations</title>
        <p>This study was approved by the Regional Ethical Review Board in Umeå, Sweden (approval numbers 2015/67-31 and 2021-03860). All participants provided written informed consent before participation. Participant data were deidentified before analysis and handled in accordance with applicable Swedish regulations on privacy, confidentiality, and patient safety. Participants did not receive any compensation.</p>
      </sec>
      <sec>
        <title>Biomechanical Data Collection</title>
        <p>We used the SRSH test to evaluate lateral dynamic stability, neuromuscular control, and lower-limb coordination under sport-specific plyometric loading [<xref ref-type="bibr" rid="ref18">18</xref>]. During each trial, participants performed single-leg lateral hops across 2 adjacent force plates, with an emphasis on controlled landings and immediate rebounds.</p>
        <p>Each participant completed between 5 and 10 valid SRSH trials under standardized protocol conditions. Three-dimensional motion capture was conducted using an 8-camera optical system (Qualisys Oqus 300, 240 Hz), synchronized with dual force platforms (Kistler 9286AA, 1200 Hz). A 56-marker full-body setup captured segmental joint motion.</p>
      </sec>
      <sec>
        <title>Data Preprocessing</title>
        <p>Marker trajectories were processed in Qualisys Track Manager (version 2019.3) using a zero-lag, fourth-order low-pass Butterworth filter (12 Hz for markers; 50 Hz for force data). The filtered data were exported to Visual3D (C-Motion Inc) for calculation of joint angles and moments via inverse dynamics. All kinetic variables were normalized to body mass.</p>
        <p>The stance phase—defined from initial ground contact to the lowest vertical position of the center of mass—was segmented and time-normalized to 101 points using third-order polynomial fitting. Trial-level metadata, including participant identifiers <italic>p<sub>i</sub></italic> and trial indices <italic>t<sub>i</sub></italic>, were retained for participant-aware modeling and validation.</p>
      </sec>
      <sec>
        <title>Target Labeling: Fear of Reinjury</title>
        <p>After preprocessing, fear of reinjury was assessed using item 9 of the 17-item Tampa Scale of Kinesiophobia, which states “I am afraid that I might injure myself accidentally.” Following the classification procedure proposed by Markström et al [<xref ref-type="bibr" rid="ref19">19</xref>], participants who responded “agree” or “strongly agree” were categorized as having high fear (n=36 participants, 301 trials), and those who responded “disagree” or “strongly disagree” were categorized as having low fear (n=36 participants, 322 trials). These binary categories served as target labels for the classification task.</p>
      </sec>
      <sec>
        <title>Feature Engineering and Selection</title>
        <p>Time-series descriptors were generated with TSFRESH [<xref ref-type="bibr" rid="ref20">20</xref>] from 48 biomechanical signals (angles and moments; lateral/medial landings). We retained 13 summary statistics per signal (mean, SD, variance, skewness, kurtosis, 10th/50th/90th percentiles, autocorrelation at lags 1 and 2, counts above/below the mean, and absolute sum of changes), yielding 624 features per trial.</p>
        <p>TSFRESH’s built-in hypothesis tests were applied at α=.05 with false-discovery control to remove noninformative descriptors. Surviving features were ranked by Gradient Boosting Classifier (GBC) importance, computed on the pooled training data used for feature preparation. The ranking produced top-k subsets (top 5-10 [step size 1], then 20-100). To keep inputs compact, stable, and fast to evaluate—and to avoid mixing feature selection with performance reporting—we fixed k=10 a priori and used the corresponding top-10 set for all CV experiments. This choice reflects diminishing returns beyond small k on our development trials and keeps the focus on the validation design rather than on feature-count tuning. Importantly, the top-10 feature set was determined before any CV experiment began and was held constant across all folds and strategies. The features were selected using domain-relevant biomechanical reasoning and a single, preexperimental GBC ranking, not through a data-driven search repeated within each fold. As a result, no information from test folds influenced the feature set, eliminating feature-selection bias from the validation comparison. Correlated features (eg, bilateral joint angles) were intentionally retained, as the goal was to evaluate validation-strategy effects under a fixed, realistic feature set rather than to optimize feature independence.</p>
        <p>The top-10 ranked features (with signal/statistic provenance) are listed in <xref ref-type="table" rid="table1">Table 1</xref> and were held fixed across all models and validation schemes. Within each fold, these features were z-scaled (using StandardScaler from scikit-learn) on the training split and then applied to the corresponding test split. Feature ranks are documented to specify model inputs for the validation comparison. They are not advanced as biomarker evidence; this study aimed to examine participant-aware evaluation and performance inflation when participant identity is ignored. An overview of this feature pipeline and its integration with the evaluation design is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Summary of the top 10 selected features for classifying fear of reinjury<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="190"/>
            <col width="320"/>
            <col width="120"/>
            <col width="370"/>
            <thead>
              <tr valign="top">
                <td>Joint</td>
                <td>Plane</td>
                <td>Side</td>
                <td>Statistic</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Ankle</td>
                <td>Supination/pronation</td>
                <td>Medial</td>
                <td>Absolute sum of changes</td>
              </tr>
              <tr valign="top">
                <td>Hip</td>
                <td>Rotation</td>
                <td>Medial</td>
                <td>Mean</td>
              </tr>
              <tr valign="top">
                <td>Ankle</td>
                <td>Rotation</td>
                <td>Medial</td>
                <td>90th percentile</td>
              </tr>
              <tr valign="top">
                <td>Trunk</td>
                <td>Lean (lateral flexion)</td>
                <td>Lateral</td>
                <td>Kurtosis</td>
              </tr>
              <tr valign="top">
                <td>Ankle</td>
                <td>Flexion/extension</td>
                <td>Lateral</td>
                <td>10th percentile</td>
              </tr>
              <tr valign="top">
                <td>Knee</td>
                <td>Rotation</td>
                <td>Lateral</td>
                <td>Skewness</td>
              </tr>
              <tr valign="top">
                <td>Hip moment</td>
                <td>Rotation</td>
                <td>Medial</td>
                <td>Absolute sum of changes</td>
              </tr>
              <tr valign="top">
                <td>Knee</td>
                <td>Rotation</td>
                <td>Medial</td>
                <td>10th percentile</td>
              </tr>
              <tr valign="top">
                <td>Hip</td>
                <td>Adduction/abduction</td>
                <td>Lateral</td>
                <td>Median (50th percentile)</td>
              </tr>
              <tr valign="top">
                <td>Knee</td>
                <td>Adduction/abduction</td>
                <td>Medial</td>
                <td>Median (50th percentile)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Joint-level sources, anatomical planes, body side, and statistical descriptors were extracted from side hop rebound trials. Features were selected based on the Gradient Boosting Classifier importance ranking after univariate filtering.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Study framework for participant-aware validation. Raw kinematic and kinetic time-series data are first summarized using TSFRESH feature extraction. Features are then screened with statistical tests applying false discovery rate (FDR) correction, ranked using a Gradient Boosting Classifier (GBC), and selected at multiple thresholds for subsequent model training and evaluation. Panel labels indicate category (contribution, prior practice, commonly used, problematic). Colors are supplemented with text labels to ensure readability. ET: extra trees; KNN: k-nearest neighbors; LDA: linear discriminant analysis; LOPOCV: leave-one-participant-out cross-validation; LR: logistic regression; QDA: quadratic discriminant analysis; RF: random forest.</p>
          </caption>
          <graphic xlink:href="ai_v5i1e87728_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Machine Learning Algorithms and Hyperparameter Tuning</title>
        <sec>
          <title>Classifiers</title>
          <p>Ten supervised classification models were selected based on their frequent use in biomechanical and clinical ML tasks, representing a range of algorithmic families. These included 3 linear models—logistic regression (LR), linear discriminant analysis (LDA), and quadratic discriminant analysis (QDA); 1 instance-based method—k-nearest neighbors (KNN); and 6 ensemble-based classifiers—random forest (RF), extra trees (ET), AdaBoost (ADA), GBC, Extreme Gradient Boosting (XGBoost), and Light Gradient Boosting Machine (LGBM).</p>
        </sec>
        <sec>
          <title>Hyperparameter Handling and Nested Validation</title>
          <p>Two hyperparameter strategies were implemented: a fixed configuration using default settings and a tuned configuration using nested CV. In all single-level baselines (stratified 10-fold, group K-fold [K=3], LOPOCV), models used the fixed hyperparameters listed in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (right column). Only the nested design performed grid search in the inner loop; no tuning ever used outer-test data. In the fixed configuration, each model was initialized with its default hyperparameters (via a common build_models function), and no grid search was performed. This fixed-parameter setup was used for the 3 single-level CV schemes noted above.</p>
          <p>By contrast, the tuned configuration used nested CV, wherein group K-fold (K=3) served as the inner loop for hyperparameter optimization and LOPOCV as the outer loop for evaluation. We chose K=3 to balance 2 constraints. First, each inner training fold required a sufficient number of participants for stable model fitting (approximately 48 of the 71 remaining participants per outer fold). Second, computational cost scales linearly with K in the inner loop and multiplicatively with the 72-fold outer LOPOCV. Hyperparameter tuning was performed using GridSearchCV (scikit-learn) across predefined parameter grids (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For each outer fold (held-out participant), the best hyperparameters (denoted θ*) were selected based on inner-loop validation accuracy. A final model was then retrained on the entire outer training set using θ* and evaluated once on the held-out outer participant. This design cleanly decouples tuning from evaluation, minimizing selection bias.</p>
        </sec>
        <sec>
          <title>Bias-Variance Handling</title>
          <p>In the nested configuration, the inner-loop grid spans the parameters that govern model capacity (bias-variance trade-off). Selection maximizes macro-<italic>F</italic><sub>1</sub>-score averaged across inner folds; when 2 settings are statistically indistinguishable (Δ≤0.5 pp), ties are broken in favor of the simpler configuration (lower capacity) to reduce variance. We quantify variance at evaluation time via outer-fold dispersion across participants and participant-level rank stability.</p>
        </sec>
      </sec>
      <sec>
        <title>CV Methodology</title>
        <sec>
          <title>Notation and Definitions</title>
          <p>To formalize our dataset and modeling pipeline, we define the following: Let the dataset be denoted as <italic>D</italic> = {(<italic>x<sub>i</sub></italic>, <italic>y<sub>i</sub></italic>, <italic>p<sub>i</sub></italic>, <italic>t<sub>i</sub></italic>)}<italic><sup>N</sup><sub>i</sub></italic><sub>=1</sub>, where <italic>x<sub>i</sub></italic> is the feature vector; <italic>y<sub>i</sub></italic> ∈ {0, 1} is the binary class label; <italic>ŷ<sub>i</sub></italic> represents the predicted class label for trial <italic>i</italic>; <italic>p<sub>i</sub></italic> ∈ <italic>P</italic> is the participant identifier; and <italic>t<sub>i</sub></italic> is the trial number for the <italic>i</italic>th observation within participant <italic>p<sub>i</sub></italic>.</p>
          <p>Let <italic>P</italic> be the set of unique participant IDs with cardinality |<italic>P</italic>|, and <italic>D<sub>p</sub></italic> = {(<italic>x<sub>i</sub></italic>, <italic>y<sub>i</sub></italic>, <italic>t<sub>i</sub></italic>) | <italic>p<sub>i</sub></italic> = <italic>p</italic>} be the set of all trials for participant <italic>p</italic>.</p>
          <p>We define <italic>M</italic> as an ML model instance; <italic>θ</italic> ∈ Θ as a specific hyperparameter configuration; Θ as the hyperparameter search space; Train(<italic>M</italic>, <italic>D</italic><sub>train</sub>) as the training procedure; Eval(<italic>M</italic>, <italic>D</italic><sub>test</sub>) as the evaluation procedure (eg, accuracy, <italic>F</italic><sub>1</sub>-score); and Metric(·) as any function computing a classification metric from predicted and true labels.</p>
        </sec>
        <sec>
          <title>Stratified K-Fold CV</title>
          <p>In stratified K-fold CV, the full dataset <italic>D</italic> is randomly divided into <italic>k</italic> approximately equal folds while preserving the overall class distribution. We use the stratified variant to maintain class prevalence within each fold; this is not a rebalancing method but a splitting rule that preserves label proportions while remaining trial-wise. For each fold <italic>j</italic> ∈ {1, ..., <italic>k</italic>}, the data are partitioned into a training set <italic>D</italic><sup>(</sup><italic><sup>j</sup></italic><sup>)</sup><sub>train</sub> and a test set <italic>D</italic><sup>(</sup><italic><sup>j</sup></italic><sup>)</sup><sub>test</sub>. The model is trained on <italic>D</italic><sup>(</sup><italic><sup>j</sup></italic><sup>)</sup><sub>train</sub>, evaluated on <italic>D</italic><sup>(</sup><italic><sup>j</sup></italic><sup>)</sup><sub>test</sub>, and the process is repeated for all folds. Final performance is computed as the average across all folds.</p>
          <p>We used StratifiedKFold (k=10, shuffle=True, random_state=42) from scikit-learn. While commonly used, this method does not control for participant identity and may introduce data leakage in repeated-measures datasets (see <xref rid="figure2" ref-type="fig">Figure 2</xref>A). The full pseudocode is provided in Algorithm S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Panels compare 4 evaluation schemes while holding participants as the grouping unit. Tiles represent trials (or segments), colors indicate class labels, and vertical dividers mark participant boundaries. (A) In trial-wise stratified K-fold, folds are stratified by label at the trial level, so trials from the same participant may appear in both training and test sets (risk of data leakage). (B) In group K-fold, folds are formed at the participant level, with all trials from a given participant assigned to a single fold, preventing overlap between training and test sets. (C) In leave-one-participant-out (LOPOCV), 1 participant is held out as the test set while the remaining participants form the training set, and this process is repeated across participants. (D) In nested LOPOCV (outer) with group K-fold (inner), the outer loop follows LOPOCV, and within each outer training split, an inner group K-fold is used for hyperparameter tuning; only performance on the outer test sets is reported. Dashed boxes denote outer folds, solid boxes denote inner folds, and arrows illustrate the flow from training to tuning to testing. Vertical grid lines indicate participant boundaries (P1-P10), and the inset shows repeated trials (T1-T4). White indicates training set; green/blue indicates test set for the respective loop. CV: cross-validation.</p>
            </caption>
            <graphic xlink:href="ai_v5i1e87728_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Leave-One-Participant-Out Cross-Validation</title>
          <p>LOPOCV evaluates true interparticipant generalization by holding out all trials from a single participant in each fold. For each participant <italic>p</italic> ∈ <italic>P</italic>, the model is trained on the dataset <italic>D</italic><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup><sub>train</sub> = <italic>D</italic>\<italic>D<sub>p</sub></italic>, where <italic>D<sub>p</sub></italic> contains all trials from participant <italic>p</italic>. The trained model is then evaluated on <italic>D<sub>p</sub></italic>, ensuring no overlap between training and testing data.</p>
          <p>We implemented LOPOCV using LeaveOneGroupOut from scikit-learn, treating participant ID as the grouping variable. This method enforces participant-level separation but does not address selection bias during hyperparameter tuning (see <xref rid="figure2" ref-type="fig">Figure 2</xref>C). An explicit step-by-step description is provided in Algorithm S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, including participant-averaged metrics and train-test gap calculation.</p>
        </sec>
        <sec>
          <title>Group K-Fold CV</title>
          <p>Group K-fold (K=3) CV preserves participant-level independence by splitting the set of participants <italic>P</italic> into <italic>k</italic> disjoint subsets <italic>P</italic><sub>1</sub>, ..., <italic>P<sub>k</sub></italic>. For each fold <italic>j</italic>, all data from participants in the group <italic>P<sub>j</sub></italic> form the test set <italic>D</italic><sup>(</sup><italic><sup>j</sup></italic><sup>)</sup><sub>test</sub>, while the remaining data constitute the training set <italic>D</italic><sup>(</sup><italic><sup>j</sup></italic><sup>)</sup><sub>train</sub>. The model is trained and evaluated accordingly.</p>
          <p>We used GroupKFold (n_splits=3) to ensure that all trials from each participant remained in the same fold. This method maintains participant-level separation during training and testing while allowing multiple participants per fold. Compared with LOPOCV, group K-fold (K=3) offers greater computational efficiency but does not fully isolate tuning from evaluation (see <xref rid="figure2" ref-type="fig">Figure 2</xref>B). Implementation details are provided in Algorithm S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        </sec>
        <sec>
          <title>Combined Approach: Nested LOPOCV (Outer) With Group K-Fold CV (Inner)</title>
          <p>To support robust hyperparameter tuning without introducing selection bias, we implemented a nested CV framework. The outer loop used LOPOCV, while the inner loop used group K-fold (K=3) CV for tuning.</p>
          <p>In the outer loop, for each participant <italic>p</italic>, the test set <italic>D</italic><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup><sub>outer_test</sub> comprised all trials from that participant. The remaining data <italic>D</italic><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup><sub>outer_train</sub>=<italic>D</italic>\<italic>D<sub>p</sub></italic> was used for training and tuning. Within this training set, participants <italic>P</italic>\{<italic>p</italic>} were split into 3 groups for the inner loop.</p>
          <p>The inner loop performed a 3-fold CV across these remaining participants. For each inner fold <italic>j</italic>, the training and validation sets were defined as follows:</p>
          <list list-type="bullet">
            <list-item>
              <p>Inner validation set:</p>
              <p><italic>D</italic><sub>val</sub><sup>(<italic>p,j</italic>)</sup> = ∪<sub><italic>p′∈p(j)</italic></sub><italic>D</italic><sub><italic>p′</italic></sub>
</p>
            </list-item>
            <list-item>
              <p>Inner training set:</p>
              <p><italic>D</italic><sub>train</sub><sup>(<italic>p,j</italic>)</sup> = <italic>D</italic><sub>outer_train</sub><sup>(<italic>p</italic>)</sup>\<italic>D</italic><sub>val</sub><sup>(<italic>p,j</italic>)</sup></p>
            </list-item>
          </list>
          <p>Each model <italic>M</italic><sub>θ</sub><sup>(</sup><italic><sup>p</sup></italic><sup>,</sup><italic><sup>j</sup></italic><sup>)</sup> was trained with a candidate hyperparameter configuration <italic>θ</italic> ∈ Θ and evaluated on <italic>D</italic><sub>val</sub><sup>(</sup><italic><sup>p,j</sup></italic><sup>)</sup>. The optimal configuration <italic>θ<sub>p</sub>*</italic> was selected based on mean inner validation performance:</p>
          <graphic xlink:href="ai_v5i1e87728_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          <p>Using <italic>θ<sub>p</sub>*</italic>, the final model <italic>M</italic><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup> was trained on <italic>D</italic><sub>outer_train</sub><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup> and evaluated on <italic>D</italic><sub>outer_test</sub><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup>.</p>
          <p>This nested design ensures that hyperparameter tuning is fully decoupled from performance estimation by isolating inner tuning folds from outer evaluation folds. <xref rid="figure2" ref-type="fig">Figure 2</xref>D illustrates the complete nested validation structure. The nested, participant-aware framework is specified in Algorithm S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <p>The hyperparameter grids used in this process were deliberately designed to be capacity-aware, enabling the inner CV to explicitly explore low-bias/high-variance versus high-bias/low-variance regimes (see Tables S2 and S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for the capacity-controlling parameters).</p>
        </sec>
      </sec>
      <sec>
        <title>Performance Evaluation Metrics</title>
        <p>We assessed model performance using standard classification metrics derived from the confusion matrix, based on counts of true positives (TPs), true negatives (TNs), false positives (FPs), and false negatives (FNs). The following metrics were computed for each outer test fold in every CV scheme:</p>
        <disp-formula>Accuracy = (TP + TN)/(TP + TN + FP + FN)</disp-formula>
        <disp-formula>Precision = (TP)/(TP + FP)</disp-formula>
        <disp-formula>Recall = (TP)/(TP + FN)</disp-formula>
        <disp-formula><italic>F</italic><sub>1</sub>-score = 2 × (precision × recall)/(precision + recall)</disp-formula>
        <graphic xlink:href="ai_v5i1e87728_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>We calculated these metrics from the test-fold predictions of the outer loops for all 4 CV schemes: stratified K-fold, LOPOCV, group K-fold (K=3), and nested LOPOCV.</p>
      </sec>
      <sec>
        <title>Participant-Aware Summary Metrics</title>
        <sec>
          <title>Summary Metrics for Participant-Aware Validation Strategies</title>
          <p>For participant-aware strategies (LOPOCV and nested LOPOCV), we computed 2 complementary summaries:</p>
        </sec>
        <sec>
          <title>Mean LOPOCV Performance (Participant-Averaged)</title>
          <p>This metric averages model performance across all participants, giving equal weight to each individual, regardless of the number of trials:</p>
          <graphic xlink:href="ai_v5i1e87728_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          <p>where <italic>P</italic> is the total number of participants; and <italic>D</italic><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup><sub>test</sub> denotes the outer test set for participant <italic>p</italic>.</p>
        </sec>
        <sec>
          <title>Overall Performance (Aggregated Level)</title>
          <p>This summary pools all test predictions and true labels across outer folds and then computes each performance metric on the combined dataset:</p>
          <disp-formula>Overall metric = Metric(∪<italic><sub>p</sub></italic><sub>=1</sub><italic><sup>P</sup></italic>{<italic>i</italic> ∈ <italic>D</italic><sub>test</sub><sup>(</sup><italic><sup>p</sup></italic><sup>)</sup>})</disp-formula>
          <p>The mean LOPOCV performance emphasizes cross-participant stability, whereas overall performance reflects population-level predictive accuracy by weighting participants according to trial count.</p>
        </sec>
      </sec>
      <sec>
        <title>Train-Test Gap: Generalization Stability</title>
        <p>To assess generalization stability and potential overfitting, we computed the train-test gap for each outer fold in the LOPOCV and nested CV frameworks. This gap quantifies the difference in model accuracy between the outer training and test sets.</p>
        <p>Let <italic>θ</italic>* ∈ Θ denote the selected hyperparameter configuration—either default (in the fixed configuration) or optimized via inner-loop group K-fold tuning (in nested CV). Let <italic>M<sub>θ</sub></italic><sub>*</sub> be the trained model using this configuration. For each outer fold <italic>f</italic>, corresponding to the held-out participant <italic>p</italic>, the train-test gap is defined as follows:</p>
        <disp-formula>Gap<sup>(</sup><italic><sup>f</sup></italic><sup>)</sup> = Accuracy[<italic>M<sub>θ</sub></italic><sub>*</sub>, <italic>D</italic><sup>(</sup><italic><sup>f</sup></italic><sup>)</sup><sub>train</sub>] – Accuracy[<italic>M<sub>θ</sub></italic><sub>*</sub>, <italic>D</italic><sup>(</sup><italic><sup>f</sup></italic><sup>)</sup><sub>test</sub>]</disp-formula>
        <p>This metric quantifies how well a model’s training performance carries over to unseen test participants. Large gaps indicate potential overfitting, whereas small gaps suggest stable generalization.</p>
        <p>To summarize overall generalization behavior, we report the mean and SD of the train-test gap across all participants for each model <italic>m</italic>:</p>
        <graphic xlink:href="ai_v5i1e87728_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>This metric reflects how well training performance translates to unseen test participants. We did not compute the train-test gap for stratified K-fold or group K-fold CV: stratified K-fold allows trials from the same participant to appear in both the training and test sets, thereby violating independence, whereas group K-fold, although participant-separated, pools multiple participants per fold and thus does not yield the per-participant gap estimates required for this analysis.</p>
      </sec>
      <sec>
        <title>Bias Estimation Across Validation Strategies</title>
        <p>To quantify how CV strategies deviate from participant-aware evaluation, we compared strategy-level summary metrics with a participant-aware baseline. The baseline reference was either LOPOCV or nested LOPOCV, as indicated in each comparison. For each metric <italic>m</italic> ∈ {accuracy, <italic>F</italic><sub>1</sub>-score, precision, recall, Matthews correlation coefficient}, bias was computed as follows:</p>
        <graphic xlink:href="ai_v5i1e87728_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>where <inline-graphic xlink:href="ai_v5i1e87728_fig10.png" xlink:type="simple" mimetype="image"/> represents the participant-averaged or overall score (as specified) obtained using stratified 10-fold or group K-fold (K=3) CV; and <inline-graphic xlink:href="ai_v5i1e87728_fig11.png" xlink:type="simple" mimetype="image"/> represents the corresponding LOPOCV- or nested LOPOCV–based estimate.</p>
        <p>A positive bias implies that the strategy overestimates model performance relative to the participant-aware baseline, whereas a negative bias indicates underestimation. To quantify overall deviation across metrics, we also computed the mean absolute bias (MAB) for each model:</p>
        <graphic xlink:href="ai_v5i1e87728_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        <p>MAB reflects the average magnitude of deviation from the baseline, providing a scalar summary of validation reliability. Larger MAB values indicate greater deviation from a realistic participant-aware estimate and, thus, lower external validity. Lower MAB values indicate that the CV strategy yields results more consistent with robust, participant-separated evaluation.</p>
        <p>This procedure is consistent with earlier findings that participant-independent validation frameworks, such as LOPOCV, produce more accurate out-of-sample performance estimates, particularly in domains involving interindividual variability [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
      <sec>
        <title>Model Ranking Method</title>
        <sec>
          <title>Overview</title>
          <p>To evaluate the relative performance of classifiers and assess ranking stability across individuals, we implemented a nonparametric model-ranking procedure, following established practices in classifier evaluation [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p>
          <p>Let <italic>A<sub>p,m</sub></italic> denote the classification accuracy of the model <italic>m</italic> for participant <italic>p</italic>, computed using either LOPOCV or nested LOPOCV. The ranking procedure involved 2 steps.</p>
        </sec>
        <sec>
          <title>Step 1: Within-Participant Ranking</title>
          <p>For each participant <italic>p</italic>, all models were ranked based on their accuracy <italic>A<sub>p,m</sub></italic>, from highest (rank=1) to lowest (rank=10). Ties were resolved by assigning the average of the spanned ranks.</p>
        </sec>
        <sec>
          <title>Step 2: Across-Participant Summary Statistics</title>
          <p>For each model <italic>m</italic>, we computed the following summary metrics over all participants:</p>
          <graphic xlink:href="ai_v5i1e87728_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          <p>Lower values indicate better average performance across participants.</p>
          <graphic xlink:href="ai_v5i1e87728_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          <p>The aforesaid equation reflects consistency of ranking across participants.</p>
          <graphic xlink:href="ai_v5i1e87728_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          <p>Models with lower mean ranks and smaller rank variability were considered more robust and consistent across individuals. Ranking analyses were conducted independently for LOPOCV and nested CV frameworks to evaluate how hyperparameter tuning influences ranking stability.</p>
        </sec>
      </sec>
      <sec>
        <title>Computational Efficiency Estimation</title>
        <sec>
          <title>Computational Efficiency Metrics for Model Deployment Feasibility</title>
          <p>To assess the feasibility of deploying each model configuration, we evaluated computational efficiency using 3 metrics recorded during outer-loop validation:</p>
        </sec>
        <sec>
          <title>Training Time Per Outer Fold (Seconds)</title>
          <p>Training time per outer fold is defined as the elapsed wall-clock time required to train each model on the outer training set. For nested LOPOCV, this measure includes the full runtime of the inner-loop hyperparameter optimization using GridSearchCV. For LOPOCV, group K-fold (K=3) CV, and 10-fold CV, only the time required for model training (without tuning) was recorded.</p>
        </sec>
        <sec>
          <title>Inference Time Per Sample (Milliseconds)</title>
          <p>Inference time per sample is computed as the average time required to generate a single prediction. This value was averaged over all test samples across outer folds. It indicates the suitability of models for real-time or near–real-time applications.</p>
        </sec>
        <sec>
          <title>Model Size (Megabytes)</title>
          <p>Model size is estimated as the in-memory serialized size of each trained model instance using pickle.dumps(). This metric reflects storage requirements and transferability for deployment in clinical or embedded systems.</p>
          <p>All efficiency metrics were automatically logged during model evaluation for each model-CV combination. Comparisons across configurations enable assessment of trade-offs among performance, complexity, and ease of deployment.</p>
        </sec>
      </sec>
      <sec>
        <title>Computational Environment and Reproducibility</title>
        <p>All analyses were performed using Python 3.10.12 (Python Software Foundation) on a high-performance workstation equipped with an Intel Core i9-13900K CPU, an NVIDIA GeForce RTX 4090 GPU, and 64 GB of DDR5 RAM. The software environment included scikit-learn version 1.2.2 [<xref ref-type="bibr" rid="ref25">25</xref>], XGBoost version 1.4.2 [<xref ref-type="bibr" rid="ref26">26</xref>], LGBM version 4.3.0 [<xref ref-type="bibr" rid="ref27">27</xref>], and MLflow 2.10.2 for experiment tracking and reproducibility.</p>
        <p>All training runs, CV folds, hyperparameter configurations, and resulting artifacts were managed and logged using MLflow, which provided full experiment lineage and enabled structured comparison across model-validation combinations. Each run recorded parameter settings, metric scores, training durations, and serialized model outputs, enabling seamless integration with the reported analysis.</p>
        <p>Randomized operations, including data shuffling, CV splitting, and model initialization, were controlled using a fixed seed (random_state=42). All splitters and estimators were instantiated with this seed where applicable; CV groups were specified as participant IDs to enforce participant integrity.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Model Performance Across Validation Strategies</title>
        <p><xref ref-type="table" rid="table2">Table 2</xref> presents overall and participant-averaged accuracy for each classifier across 4 validation strategies. Performance varied by model type and validation scheme.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Model performance across validation strategies.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="210"/>
            <col width="140"/>
            <col width="120"/>
            <col width="150"/>
            <col width="170"/>
            <col width="120"/>
            <col width="90"/>
            <thead>
              <tr valign="bottom">
                <td>Model</td>
                <td>LOPOCV<sup>a</sup> (overall)</td>
                <td>LOPOCV, mean (SD)<sup>b</sup></td>
                <td>Nested CV<sup>c</sup> (overall)</td>
                <td>Nested CV, mean (SD)<sup>b</sup></td>
                <td>Group CV (SD)</td>
                <td>10-fold CV</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>k-Nearest neighbors</td>
                <td>0.69</td>
                <td>0.69 (0.29)</td>
                <td>0.68</td>
                <td>0.68 (0.30)</td>
                <td>0.65 (0.03)</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.64</td>
                <td>0.64 (0.35)</td>
                <td>0.64</td>
                <td>0.64 (0.35)</td>
                <td>0.63 (0.03)</td>
                <td>0.7</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost</td>
                <td>0.64</td>
                <td>0.63 (0.34)</td>
                <td>0.63</td>
                <td>0.61 (0.34)</td>
                <td>0.63 (0.03)</td>
                <td>0.77</td>
              </tr>
              <tr valign="top">
                <td>Linear discriminant analysis</td>
                <td>0.63</td>
                <td>0.63 (0.35)</td>
                <td>0.63</td>
                <td>0.63 (0.35)</td>
                <td>0.63 (0.03)</td>
                <td>0.7</td>
              </tr>
              <tr valign="top">
                <td>Quadratic discriminant analysis</td>
                <td>0.66</td>
                <td>0.66 (0.34)</td>
                <td>0.66</td>
                <td>0.66 (0.35)</td>
                <td>0.67 (0.03)</td>
                <td>0.77</td>
              </tr>
              <tr valign="top">
                <td>Gradient Boosting Classifier</td>
                <td>0.68</td>
                <td>0.67 (0.31)</td>
                <td>0.66</td>
                <td>0.66 (0.32)</td>
                <td>0.65 (0.03)</td>
                <td>0.85</td>
              </tr>
              <tr valign="top">
                <td>Extra trees</td>
                <td>0.66</td>
                <td>0.67 (0.34)</td>
                <td>0.65</td>
                <td>0.66 (0.33)</td>
                <td>0.64 (0.04)</td>
                <td>0.91</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.65</td>
                <td>0.64 (0.34)</td>
                <td>0.65</td>
                <td>0.65 (0.34)</td>
                <td>0.64 (0.04)</td>
                <td>0.85</td>
              </tr>
              <tr valign="top">
                <td>Extreme Gradient Boosting</td>
                <td>0.65</td>
                <td>0.64 (0.33)</td>
                <td>0.67</td>
                <td>0.66 (0.33)</td>
                <td>0.65 (0.03)</td>
                <td>0.87</td>
              </tr>
              <tr valign="top">
                <td>Light Gradient Boosting Machine</td>
                <td>0.63</td>
                <td>0.62 (0.33)</td>
                <td>0.64</td>
                <td>0.62 (0.33)</td>
                <td>0.62 (0.03)</td>
                <td>0.79</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>LOPOCV: leave-one-participant-out cross-validation.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>LOPOCV, mean (SD), and nested CV, mean (SD), refer to participant-averaged accuracy.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>CV: cross-validation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The highest accuracy values for the ensemble models ET, GBC, RF, and XGBoost were obtained under 10-fold CV. For example, ET achieved 0.91 accuracy under 10-fold CV and 0.66 under LOPOCV. LR and LDA exhibited relatively stable accuracy across validation methods, with differences ≤0.03 between LOPOCV and 10-fold CV.</p>
        <p>Accuracy estimates under nested CV closely matched those from LOPOCV, differing by ≤0.02 across all classifiers. Group K-fold (K=3) CV produced intermediate accuracy scores, typically falling between LOPOCV and 10-fold CV. Full evaluation metrics (accuracy, <italic>F</italic><sub>1</sub>-score, precision, recall, and Matthews correlation coefficient) for each model are provided in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Train-Test Gap and Overfitting Risk</title>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref>A summarizes the mean (SD) train-test gaps for each classifier under LOPOCV and nested LOPOCV; exact values are provided in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Gaps ranged from 0.08 to 0.36 across models. Ensemble methods (ET, RF, and GBC) showed the largest gaps (≥0.34) under both validation strategies, whereas LR and LDA were approximately 0.08. Gap estimates were similar for LOPOCV and the nested scheme, with no consistent differences across models. <xref rid="figure3" ref-type="fig">Figure 3</xref>B shows participant-level distributions. Ensemble models displayed wider IQRs and more outliers, whereas linear models were more tightly clustered around their medians, consistent with <xref rid="figure3" ref-type="fig">Figure 3</xref>A.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Generalization gap under participant-aware validation. (A) Mean train-test gap for each classifier under leave-one-participant-out cross-validation (LOPOCV) and nested LOPOCV; bars represent the mean across participants and error bars indicate SD, with smaller gaps reflecting better generalization. (B) Participant-level dispersion of the train-test gap for the same classifiers and validation schemes, shown as boxplots (center line indicates median; box represents IQR; whiskers are 1.5×IQR; points indicate outliers). Wider IQRs and more frequent outliers—particularly for ensemble models—indicate greater instability in participant-level generalization. Complete numerical results for all models and both schemes are provided in Table S4 in Multimedia Appendix 1. ADA: AdaBoost; AUC: area under the curve; CV: cross-validation; ET: extra trees; GBC: Gradient Boosting Classifier; KNN: k-nearest neighbors; LDA: linear discriminant analysis; LGBM: Light Gradient Boosting Machine; LR: logistic regression; QDA: quadratic discriminant analysis; RF: random forest; XGBoost: Extreme Gradient Boosting.</p>
          </caption>
          <graphic xlink:href="ai_v5i1e87728_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Inline graphic 1.</p>
          </caption>
          <graphic xlink:href="ai_v5i1e87728_fig4.png" alt-version="no" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Bias Between Validation Strategies</title>
        <p>Bias (Δ) for all classifiers is reported in Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and visualized in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (diverging scale centered at 0). Across classifiers, 10-fold CV exhibits the largest positive Δ relative to participant-aware baselines, consistent with performance inflation when trials from the same participant appear in both training and test sets. By contrast, LOPOCV, nested LOPOCV, and group K-fold yield Δ values approximately 0 across metrics. For example, KNN shows marked increases relative to LOPOCV under 10-fold CV (accuracy +0.22, Matthews correlation coefficient +0.82, MAB 0.36 in Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), whereas LR shows no measurable difference between nested LOPOCV and Group K-fold.</p>
      </sec>
      <sec>
        <title>Model Ranking Consistency Across Participants</title>
        <p><xref ref-type="table" rid="table3">Table 3</xref> summarizes participant-level rank statistics for each classifier under LOPOCV and nested CV. For each model, the table displays the mean rank, SD, best rank, and worst rank across all participants. Lower rank values correspond to higher participant-level accuracy. Across both validation strategies, ET yielded the lowest mean ranks (5.19 for LOPOCV and 5.39 for nested CV), whereas LGBM had the highest (6.05 and 6.06, respectively). SDs ranged from 1.89 (RF, LOPOCV) to 2.79 (ADA, nested CV).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Model ranking statistics under leave-one-participant-out cross-validation rank and nested cross-validation<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="400"/>
            <col width="130"/>
            <col width="70"/>
            <col width="100"/>
            <col width="0"/>
            <col width="150"/>
            <col width="70"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td colspan="4">Leave-one-participant-out cross-validation rank</td>
                <td colspan="3">Nested cross-validation rank</td>
              </tr>
              <tr valign="bottom">
                <td>
                  <break/>
                </td>
                <td>Mean (SD)<sup>b</sup></td>
                <td>Best</td>
                <td>Worst</td>
                <td colspan="2">Mean (SD)<sup>b</sup></td>
                <td>Best</td>
                <td>Worst</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>k-Nearest neighbors</td>
                <td>5.39 (2.65)</td>
                <td>1</td>
                <td>10</td>
                <td colspan="2">5.34 (2.63)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>5.51 (2.70)</td>
                <td>1</td>
                <td>10</td>
                <td colspan="2">5.42 (2.70)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost</td>
                <td>5.64 (2.57)</td>
                <td>1</td>
                <td>10</td>
                <td colspan="2">5.81 (2.79)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Linear discriminant analysis</td>
                <td>5.55 (2.70)</td>
                <td>1</td>
                <td>10</td>
                <td colspan="2">5.52 (2.74)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Quadratic discriminant analysis</td>
                <td>5.28 (2.43)</td>
                <td>1</td>
                <td>10</td>
                <td colspan="2">5.27 (2.52)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Gradient Boosting Classifier</td>
                <td>5.22 (2.15)</td>
                <td>1</td>
                <td>9</td>
                <td colspan="2">5.47 (2.24)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Extra trees</td>
                <td>5.19 (2.16)</td>
                <td>1</td>
                <td>10</td>
                <td colspan="2">5.39 (2.02)</td>
                <td>2</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>5.49 (1.89)</td>
                <td>1.5</td>
                <td>10</td>
                <td colspan="2">5.37 (2.06)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Extreme Gradient Boosting</td>
                <td>5.68 (2.48)</td>
                <td>1</td>
                <td>10</td>
                <td colspan="2">5.37 (2.23)</td>
                <td>1</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Light Gradient Boosting Machine</td>
                <td>6.05 (2.34)</td>
                <td>1.5</td>
                <td>10</td>
                <td colspan="2">6.06 (2.14)</td>
                <td>2</td>
                <td>10</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Lower mean ranks and SDs indicate more consistent participant-level performance. Rankings were based on individual classification accuracy within each outer test fold.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Best-rank values were fractional (eg, [1+2]/2=1.5) when multiple models achieved identical participant-level accuracy and tied for top performance. In such cases, ranks were averaged according to standard ranking conventions [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>All classifiers achieved a best rank of 1.0 for at least one participant, with fractional values (eg, 1.5) indicating ties averaged according to standard conventions. The worst rank was 10.0 for all classifiers, with the single exception of the Gradient Boosting Classifier under LOPOCV (worst rank 9). Differences in rank between LOPOCV and nested CV were small. For 8 of the 10 models, the absolute difference in mean rank was ≤0.25. Changes in rank variability (SD) were also limited, with maximum observed shifts of 0.31 in mean rank and 0.25 in SD, both for Extreme Gradient Boosting.</p>
      </sec>
      <sec>
        <title>Computational Efficiency Analysis</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> presents the average training time per outer fold, inference time per sample, and final model size for each classifier under 4 validation strategies. Training time varied by more than 4 orders of magnitude across models. The fastest models—KNN, QDA, LDA, and LR—completed training in under 0.02 seconds per fold across all validation strategies. LGBM required 275 seconds per fold under nested CV and 125 seconds under 10-fold CV, even without tuning. XGBoost was more efficient than the other 3 ensembles during tuning, requiring approximately 3.7 seconds per fold.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Training time, inference latency, and model size by classifier and cross-validation strategy.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="130"/>
            <col width="70"/>
            <col width="60"/>
            <col width="90"/>
            <col width="80"/>
            <col width="0"/>
            <col width="80"/>
            <col width="60"/>
            <col width="80"/>
            <col width="70"/>
            <col width="0"/>
            <col width="70"/>
            <col width="60"/>
            <col width="90"/>
            <col width="60"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td colspan="5">Training time (seconds)<sup>a</sup></td>
                <td colspan="5">Inference time (ms/sample)<sup>b</sup></td>
                <td colspan="4">Model size (MB)<sup>c</sup></td>
              </tr>
              <tr valign="bottom">
                <td>
                  <break/>
                </td>
                <td>10-fold CV<sup>d</sup></td>
                <td>Group CV</td>
                <td>LOPOCV<sup>e</sup></td>
                <td>Nested CV<sup>f</sup></td>
                <td colspan="2">10-fold CV</td>
                <td>Group CV</td>
                <td>LOPOCV</td>
                <td>Nested CV<sup>f</sup></td>
                <td colspan="2">10-fold CV</td>
                <td>Group CV</td>
                <td>LOPOCV</td>
                <td>Nested CV<sup>f</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>k-Nearest neighbors</td>
                <td>0.012</td>
                <td>0.012</td>
                <td>0.012</td>
                <td>0.026</td>
                <td colspan="2">0.00<sup>g</sup></td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.101</td>
                <td>0.074</td>
                <td>0.110</td>
                <td>0.110</td>
              </tr>
              <tr valign="top">
                <td>Logistic regression</td>
                <td>0.013</td>
                <td>0.013</td>
                <td>0.013</td>
                <td>0.018</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.001</td>
                <td>0.001</td>
                <td>0.001</td>
                <td>0.001</td>
              </tr>
              <tr valign="top">
                <td>AdaBoost</td>
                <td>0.084</td>
                <td>0.077</td>
                <td>0.094</td>
                <td>0.183</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.026</td>
                <td>0.026</td>
                <td>0.026</td>
                <td>0.032</td>
              </tr>
              <tr valign="top">
                <td>Linear discriminant analysis</td>
                <td>0.013</td>
                <td>0.012</td>
                <td>0.013</td>
                <td>0.016</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.002</td>
                <td>0.002</td>
                <td>0.002</td>
                <td>0.002</td>
              </tr>
              <tr valign="top">
                <td>Quadratic discriminant analysis</td>
                <td>0.012</td>
                <td>0.012</td>
                <td>0.012</td>
                <td>0.016</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.003</td>
                <td>0.003</td>
                <td>0.003</td>
                <td>0.003</td>
              </tr>
              <tr valign="top">
                <td>Gradient Boosting Classifier</td>
                <td>0.157</td>
                <td>0.133</td>
                <td>0.165</td>
                <td>0.604</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.110</td>
                <td>0.108</td>
                <td>0.110</td>
                <td>0.231</td>
              </tr>
              <tr valign="top">
                <td>Extra trees</td>
                <td>0.138</td>
                <td>0.141</td>
                <td>0.141</td>
                <td>0.476</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>2.00</td>
                <td>2.00</td>
                <td colspan="2">3.053</td>
                <td>2.125</td>
                <td>3.285</td>
                <td>3.980</td>
              </tr>
              <tr valign="top">
                <td>Random forest</td>
                <td>0.327</td>
                <td>0.308</td>
                <td>0.326</td>
                <td>0.600</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>2.00</td>
                <td>2.00</td>
                <td colspan="2">1.728</td>
                <td>1.240</td>
                <td>1.849</td>
                <td>1.449</td>
              </tr>
              <tr valign="top">
                <td>Extreme Gradient Boosting</td>
                <td>1.268</td>
                <td>1.317</td>
                <td>1.228</td>
                <td>3.708</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.211</td>
                <td>0.208</td>
                <td>0.211</td>
                <td>0.201</td>
              </tr>
              <tr valign="top">
                <td>Light Gradient Boosting Machine</td>
                <td>125.24</td>
                <td>95.41</td>
                <td>143.22</td>
                <td>275.48</td>
                <td colspan="2">0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td>0.00</td>
                <td colspan="2">0.214</td>
                <td>0.163</td>
                <td>0.232</td>
                <td>0.201</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Training time reflects the mean wall-clock duration per fold.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>Inference time is reported per sample in milliseconds, rounded to 2 decimal places.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>Model size is based on the serialized object in memory.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>CV: cross-validation.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>LOPOCV: leave-one-participant-out cross-validation.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>Nested CV includes tuning overhead via GridSearchCV.</p>
            </fn>
            <fn id="table4fn7">
              <p><sup>g</sup>&lt;0.005 ms/sample.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Inference times were uniformly low. All models classified a test sample in ≤2 milliseconds per sample. ET and RF had nonzero inference times (2 ms), but all models were well within real-time application thresholds. Model sizes remained modest. LR had the smallest footprint (~1 kB). ET under nested CV had the largest size (~4 MB), followed by RF and GBC. LGBM maintained a small model size (~0.23 MB) despite its long training time.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The main finding of this study is that CV strategy substantially influences reported model performance in repeated-measures classification. Stratified 10-fold CV systematically overestimated accuracy (eg, ET: 0.91 vs 0.66 under LOPOCV) due to participant-level data leakage. Participant-aware strategies (LOPOCV, group K-fold, and nested CV) produced more conservative and stable estimates. The nested LOPOCV + group K-fold framework achieved the best balance among unbiased evaluation, participant-aware separation, and hyperparameter tuning decoupling, with minimal additional bias compared with standalone LOPOCV.</p>
      </sec>
      <sec>
        <title>Theoretical Rationale for Participant-Aware Validation</title>
        <p>Accurate model evaluation in repeated-measures datasets requires validation strategies that respect the nested data structure, specifically the dependency among trials within each participant. Classical CV methods assume independent and identically distributed observations, an assumption that is violated when multiple measurements come from the same individual.</p>
        <p>Stratified K-fold CV maintains class balance across folds but splits individual trials randomly, often placing multiple trials from the same participant in both the training and test sets. This introduces participant-specific information leakage, which in turn inflates reported performance metrics and misrepresents generalizability [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>LOPOCV addresses this leakage by holding out all trials from a single participant in each fold. This ensures participant-level independence during evaluation [<xref ref-type="bibr" rid="ref8">8</xref>]. However, LOPOCV does not prevent selection bias: if hyperparameter tuning uses data from the same participants later used for evaluation, model selection remains compromised. Group K-fold (K=3) CV offers a partial solution by assigning all trials from a participant to the same fold; however, when used for both tuning and evaluation, it still allows indirect bias to persist.</p>
        <p>To resolve both leakage and selection bias, this study implemented a nested CV structure: LOPOCV for outer-loop evaluation and group K-fold (K=3) for inner-loop tuning. This structure ensures that evaluation is based on participants unseen during training and tuning. Furthermore, tuning leverages repeated trials from different individuals, handling intrasubject variability without reintroducing test data. This approach aligns with established recommendations for unbiased evaluation in supervised learning [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p>
        <p>A key rationale behind this design lies in recognizing that, in repeated-measures contexts, the effective sample size approximates the number of participants, not the total number of trials [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Trial-level CV methods that ignore this dependency systematically underestimate generalization error and lead to inflated performance metrics, particularly for models that exploit individual-specific patterns.</p>
      </sec>
      <sec>
        <title>Effects of Validation Strategy on Model Performance</title>
        <p>Stratified 10-fold CV produced the highest reported accuracies but inflated generalization estimates, with differences of up to 0.25 compared with LOPOCV (eg, ET: 0.91 vs 0.66). Among participant-aware methods, high-capacity classifiers (KNN, ET, and GBC) still exhibited train-test gaps ≥0.30 under standalone LOPOCV, suggesting overfitting to participant-idiosyncratic patterns when tuning was not decoupled from evaluation.</p>
        <p>Group K-fold (K=3) preserved participant identity while reducing computational burden, with performance comparable to LOPOCV and stable rankings. Nested CV produced the most cautious and consistent estimates across classifiers: it minimized validation bias, narrowed train-test gaps, and maintained ranking consistency, yielding a representative estimate of out-of-sample performance. A related SRSH study using participant-wise evaluation reported 75.6% accuracy, which aligns with the participant-aware range observed here and contrasts with the optimistic trial-wise figures [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        <p>These findings answer research questions 1 and 2:</p>
        <list list-type="bullet">
          <list-item>
            <p>Research question 1: LOPOCV reduces leakage but may not prevent overfitting when intrasubject variability is not managed.</p>
          </list-item>
          <list-item>
            <p>Research question 2: A nested participant-aware framework provides the most robust and unbiased evaluation of model generalization in repeated-measures data.</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Model Stability and Ranking Consistency</title>
        <p>Although the top classifiers showed comparable average accuracies, their rankings differed considerably at the individual participant level. Specifically, ET and RF demonstrated greater ranking stability across participants compared with LGBM and ADA, which showed higher variability. Despite these individual differences, rankings remained highly stable between LOPOCV and nested CV. For 8 out of 10 models, the absolute difference in mean rank between the 2 schemes was ≤0.25, and the change in rank SD was ≤0.17, supporting the conclusion that participant-aware tuning does not distort model comparisons.</p>
        <p>These findings directly address research question 3, emphasizing the practical value of incorporating participant-level validation frameworks when deploying ML models in real-world scenarios. While model stability is essential, computational demands must also be considered when selecting validation strategies for deployment.</p>
      </sec>
      <sec>
        <title>Efficiency and Practical Considerations</title>
        <p>Validation strategies varied considerably in computational cost. Nested CV incurred the highest training time, especially for ensemble models (eg, LGBM: &gt;275 seconds/fold), due to inner-loop hyperparameter tuning. By contrast, linear models (eg, LR and LDA) trained in under 0.02 seconds per fold, underscoring the substantial overhead introduced by nested designs. Group K-fold (K=3) CV offered a computationally efficient compromise, producing performance estimates comparable to LOPOCV with minimal bias while avoiding the full tuning cost of nested CV.</p>
        <p>Despite differences in training time, all models exhibited low inference latency (≤2 ms/sample) and small memory footprints, making them suitable for deployment in real-time settings once trained. In practice, training efficiency—not inference cost—should guide validation strategy selection, particularly in clinical or iterative modeling contexts where scalability and runtime constraints are critical.</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>This study was designed to isolate the impact of validation strategy on performance estimation rather than to maximize predictive accuracy. Several limitations should be acknowledged. First, the analysis was restricted to manually engineered features and narrow hyperparameter grids. While this ensured consistency across models, broader, systematically documented optimization techniques, such as Bayesian search or automated feature extraction, may yield different performance rankings. Second, the dataset was limited to a binary classification task in 72 individuals (43 males and 29 females; mean age 28.4 years, SD 6.7 years) following ACLR with hamstring autograft, all recruited 6-24 months after surgery. The extent to which the observed validation discrepancies generalize to cohorts with different demographic profiles (eg, slightly older populations, different graft types, time since surgery, or varying preinjury activity levels) remains an open question. Future studies should also test generalizability in multiclass problems, longitudinal predictions, and additional clinical contexts. Third, although the study included a range of linear and ensemble learning algorithms, it did not evaluate deep neural networks, which may be more sensitive to overfitting and leakage in repeated-measures data. Applying participant-aware nested validation to recurrent or convolutional neural networks, particularly in high-dimensional time-series biomechanics, remains a critical direction for future research. Fourth, this study did not address probability calibration or algorithmic fairness, both of which are essential for the deployment of clinical ML. Future work should integrate calibration metrics (eg, Brier score, calibration curves) and evaluate fairness-aware performance under participant-aware validation schemes. Finally, feature-level interpretation was out of scope. We focused on evaluating models rather than identifying which features drive predictions. 
Importance estimates depend on the validation design and may be unstable under trial-wise splits. We therefore reserve feature attribution for future work conducted strictly under the nested, participant-aware protocol established here, ensuring that any reported importances reflect leakage-free evaluation.</p>
      </sec>
      <sec>
        <title>Comparison With Previous Literature</title>
        <p>The limitations of stratified CV in repeated-measures settings have been well documented. Early theoretical work identified selection bias and performance inflation when training and evaluation are not properly separated [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. These studies laid the foundation for participant-aware evaluation, though they did not explicitly address repeated measures or participant-specific dependency structures.</p>
        <p>More recent applied studies in neuroimaging, biomechanics, and mobile health have adopted LOPOCV or group CV to control for participant-level leakage (eg, [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]). However, many of these implementations lack full tuning-evaluation decoupling, often using the same CV scheme for both hyperparameter selection and final testing. This results in partial leakage and residual bias, even when participants are held out during the final evaluation.</p>
        <p>Prior work has noted the risk of data leakage from trial-wise CV and the value of participant-aware evaluation [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. However, no study has systematically benchmarked a fully nested CV design (LOPOCV outer loop or group K-fold inner loop) for repeated-measures classification in biomechanics across multiple classifiers and evaluation dimensions. This study provides such a benchmark, quantifying classification performance, validation bias, train-test generalization, participant-level ranking stability, and computational efficiency across 4 strategies and 10 classifiers. A comparative overview of previous methodologies and their limitations is provided in <xref ref-type="table" rid="table5">Table 5</xref>, highlighting how this study addresses critical gaps in the current literature.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Summary of prior studies on CV<sup>a</sup> strategies in repeated-measures machine learning.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="160"/>
            <col width="130"/>
            <col width="160"/>
            <col width="250"/>
            <col width="300"/>
            <thead>
              <tr valign="top">
                <td>Study</td>
                <td>Domain</td>
                <td>Validation methods</td>
                <td>Key contribution</td>
                <td>Limitations</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Varma and Simon [<xref ref-type="bibr" rid="ref15">15</xref>]</td>
                <td>Bioinformatics</td>
                <td>Nested vs nonnested CV</td>
                <td>Theoretical analysis of overfitting and bias</td>
                <td>Did not address repeated measures or participant-dependent data</td>
              </tr>
              <tr valign="top">
                <td>Cawley and Talbot [<xref ref-type="bibr" rid="ref16">16</xref>]</td>
                <td>Machine learning theory</td>
                <td>Model selection bias</td>
                <td>Highlighted the need for a nested CV in selection</td>
                <td>No empirical comparison of CV types</td>
              </tr>
              <tr valign="top">
                <td>Krstajic et al [<xref ref-type="bibr" rid="ref31">31</xref>]</td>
                <td>Chemoinformatics</td>
                <td>K-fold and CV pitfalls</td>
                <td>Explained how improper CV inflates model complexity</td>
                <td>No participant separation or tuning evaluation</td>
              </tr>
              <tr valign="top">
                <td>Sohrab et al [<xref ref-type="bibr" rid="ref9">9</xref>]</td>
                <td>Digital mental health</td>
                <td>K-fold vs LOPOCV<sup>b</sup></td>
                <td>Demonstrated leakage risk in repeated trials</td>
                <td>Did not evaluate tuning or rank stability</td>
              </tr>
              <tr valign="top">
                <td>Steyerberg and Harrell [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
                <td>Model development</td>
                <td>Bootstrapping</td>
                <td>Advocated internal/external validation</td>
                <td>Not focused on repeated-measures structure</td>
              </tr>
              <tr valign="top">
                <td>Varoquaux et al [<xref ref-type="bibr" rid="ref8">8</xref>]</td>
                <td>Neuroimaging</td>
                <td>LOPOCV and nested CV</td>
                <td>Advocated for subject-aware design in brain decoding</td>
                <td>Limited metric scope; no bias, time, or rank evaluation</td>
              </tr>
              <tr valign="top">
                <td>Neto et al [<xref ref-type="bibr" rid="ref33">33</xref>]</td>
                <td>Digital health diagnostics</td>
                <td>Permutation tests and CV</td>
                <td>Detected identity confounding due to improper CV</td>
                <td>Proposed correction, but no comparative benchmarking</td>
              </tr>
              <tr valign="top">
                <td>Xu and Goodacre [<xref ref-type="bibr" rid="ref34">34</xref>]</td>
                <td>General machine learning</td>
                <td>CV, bootstrapping, and systematic sampling</td>
                <td>Compared methods for small-sample generalization</td>
                <td>Did not address participant-dependent data</td>
              </tr>
              <tr valign="top">
                <td>Heo et al [<xref ref-type="bibr" rid="ref35">35</xref>]</td>
                <td>Stroke outcomes</td>
                <td>Retrospective CV</td>
                <td>Compared multiple machine learning models for prediction</td>
                <td>No subject-aware splits or nested tuning</td>
              </tr>
              <tr valign="top">
                <td>Dehghani et al [<xref ref-type="bibr" rid="ref36">36</xref>]</td>
                <td>Human activity recognition</td>
                <td>Record-wise vs subject-wise CV</td>
                <td>Subject-wise CV prevents leakage and improves realism</td>
                <td>Did not include ranking or tuning comparisons</td>
              </tr>
              <tr valign="top">
                <td>Ferdinandy et al [<xref ref-type="bibr" rid="ref37">37</xref>]</td>
                <td>Animal behavior</td>
                <td>K-fold and leave-one-out cross-validation</td>
                <td>Showed that correlation in behavior data skews CV estimates</td>
                <td>No nested tuning; focused on exploratory data</td>
              </tr>
              <tr valign="top">
                <td>Islam et al [<xref ref-type="bibr" rid="ref38">38</xref>]</td>
                <td>Genomic prediction</td>
                <td>5-fold across cycles</td>
                <td>Incorporated repeated cycles in prediction</td>
                <td>Did not assess CV design effects systematically</td>
              </tr>
              <tr valign="top">
                <td>Wilimitis and Walsh [<xref ref-type="bibr" rid="ref22">22</xref>]</td>
                <td>Clinical machine learning</td>
                <td>Group CV and LOPOCV</td>
                <td>Tutorial on subject-aware evaluation</td>
                <td>No computational benchmarking or model ranking</td>
              </tr>
              <tr valign="top">
                <td>Lu et al [<xref ref-type="bibr" rid="ref39">39</xref>]</td>
                <td>Postsurgical infection</td>
                <td>Bootstrapping + K-fold</td>
                <td>Empirical uncertainty analysis of prediction</td>
                <td>No subject-level CV or overfitting assessment</td>
              </tr>
              <tr valign="top">
                <td>Chen et al [<xref ref-type="bibr" rid="ref40">40</xref>]</td>
                <td>Surgical infection</td>
                <td>Calibration + CV</td>
                <td>Evaluated clinical machine learning predictions with calibration</td>
                <td>Lacked focus on repeated measures or leakage</td>
              </tr>
              <tr valign="top">
                <td>Ghasemzadeh et al [<xref ref-type="bibr" rid="ref41">41</xref>]</td>
                <td>Speech and hearing</td>
                <td>K-fold and nested CV</td>
                <td>Nested CV improved generalization and sample efficiency</td>
                <td>No participant-level data structure handling</td>
              </tr>
              <tr valign="top">
                <td>Lumumba et al [<xref ref-type="bibr" rid="ref42">42</xref>]</td>
                <td>General machine learning</td>
                <td>Leave-one-out cross-validation, K-fold, and repeated K-fold</td>
                <td>Repeated K-fold improved stability in imbalanced data</td>
                <td>Did not address subject-specific leakage</td>
              </tr>
              <tr valign="top">
                <td>This study</td>
                <td>Biomechanics/clinical machine learning</td>
                <td>10-fold, group CV, LOPOCV, and nested LOPOCV</td>
                <td>Jointly evaluates subject-aware nested CV with bias, generalization, model ranking, and efficiency metrics</td>
                <td>Benchmarking subject-aware validation for repeated-measures movement data under real-world constraints</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>CV: cross-validation.</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>LOPOCV: leave-one-participant-out cross-validation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Regulatory Relevance</title>
        <p>Subject-aware validation aligns with emerging regulatory expectations for clinical ML systems, although direct regulatory conclusions should not be drawn from a single case study. The nested CV framework demonstrated here ensures subject-aware separation in both evaluation and hyperparameter tuning, reducing the risk of performance inflation, an issue highlighted in regulatory and ethical reviews of artificial intelligence (AI) systems. Several regulatory documents and frameworks explicitly support these principles:</p>
        <list list-type="bullet">
          <list-item>
            <p>TRIPOD+AI [<xref ref-type="bibr" rid="ref17">17</xref>] calls for complete and reproducible reporting of validation procedures that mimic intended use scenarios.</p>
          </list-item>
          <list-item>
            <p>The FDA’s Good Machine Learning Practice guidelines [<xref ref-type="bibr" rid="ref43">43</xref>] stress the need for strict separation between training and test data, comprehensive documentation of data lineage, and bias minimization.</p>
          </list-item>
          <list-item>
            <p>The EU Medical Device Regulation (2017/745) [<xref ref-type="bibr" rid="ref44">44</xref>] and the proposed Artificial Intelligence Act (2023) [<xref ref-type="bibr" rid="ref45">45</xref>] categorize clinical AI tools as high-risk technologies, requiring developers to present robust performance evidence and demonstrate algorithmic transparency.</p>
          </list-item>
        </list>
        <p>By aligning with these standards, the nested CV approach not only enhances internal validity but also contributes to regulatory readiness. It provides a principled validation pathway aligned with international expectations for safety, accountability, and trustworthiness in AI-driven medical tools.</p>
      </sec>
      <sec>
        <title>Summary of Implications</title>
        <p>This study demonstrates that the validation strategy critically influences model trustworthiness in repeated-measures classification. Trial-level CV overstates performance by ignoring participant dependencies, while subject-aware methods, particularly nested CV, offer more realistic and unbiased estimates of generalization. When full nesting is computationally infeasible, group K-fold (K=3) CV offers a practical alternative, maintaining participant-level separation with minimal validation bias. In deployment scenarios, where model outputs influence clinical or behavioral decisions, variability in participant-level predictions may undermine reliability.</p>
        <p>While this study used balanced class distributions, real-world applications often involve class imbalance and prevalence shifts. In such contexts, metrics such as positive predictive value and negative predictive value may vary despite stable sensitivity and specificity [<xref ref-type="bibr" rid="ref46">46</xref>]. Future implementations should consider calibration and context-specific performance metrics to ensure reliability under deployment conditions.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study highlights the critical role of validation strategy in evaluating ML models trained on repeated-measures data. Through a comparative evaluation of 4 CV methods—stratified 10-fold CV, group K-fold (K=3) CV, LOPOCV, and nested LOPOCV—we demonstrate that standard approaches often overestimate model performance by failing to account for participant-level dependencies.</p>
        <p>Nested, subject-aware validation provided the most reliable generalization estimates in this repeated-measures setting, and our efficiency analysis indicates viable lower-cost options when full nesting is impractical. Classifier stability, train-test gap, and computational efficiency were all found to vary substantially across validation schemes, showing that validation design materially affects performance estimates relevant for deployment.</p>
        <p>These results have practical implications for clinical model development, where decisions based on inflated metrics may compromise safety and reproducibility. The nested subject-aware framework proposed here offers a principled path toward transparent and trustworthy ML systems aligned with current regulatory expectations in health care and other domains characterized by repeated-measures designs. Subject-aware validation is not merely a methodological refinement but a practical necessity for developing reproducible, generalizable, and deployment-ready ML systems in biomechanics, behavioral research, and clinical monitoring.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>TRIPOD+AI (Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis—Artificial Intelligence) compliance checklist and additional analysis.</p>
        <media xlink:href="ai_v5i1e87728_app1.docx" xlink:title="DOCX File , 336 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ACLR</term>
          <def>
            <p>anterior cruciate ligament reconstruction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ADA</term>
          <def>
            <p>AdaBoost</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CV</term>
          <def>
            <p>cross-validation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">ET</term>
          <def>
            <p>extra trees</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">FN</term>
          <def>
            <p>false negative</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">FP</term>
          <def>
            <p>false positive</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">GBC</term>
          <def>
            <p>Gradient Boosting Classifier</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">KNN</term>
          <def>
            <p>k-nearest neighbors</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">LDA</term>
          <def>
            <p>linear discriminant analysis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">LGBM</term>
          <def>
            <p>Light Gradient Boosting Machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">LOPOCV</term>
          <def>
            <p>leave-one-participant-out cross-validation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">MAB</term>
          <def>
            <p>mean absolute bias</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">QDA</term>
          <def>
            <p>quadratic discriminant analysis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">SRSH</term>
          <def>
            <p>standardized rebound side hops</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">TN</term>
          <def>
            <p>true negative</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">TP</term>
          <def>
            <p>true positive</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">TRIPOD+AI</term>
          <def>
            <p>Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis—Artificial Intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb22">XGBoost</term>
          <def>
            <p>Extreme Gradient Boosting</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Umeå University School of Sport Science (grant Karbalaie IH 5.1-7-2025); the Swedish Research Council (grants Häger 2017-00892 and 2022-00774); Region Västerbotten (Häger Cutting Edge funding [grant RV966109], ALF funding [grant RV967112]); and the King Gustaf V and Queen Victoria’s Foundation of Freemasons. Additional support was provided by the Stockholm Medical Artificial Intelligence and Learning Environments (SMAILE) core facility at Karolinska Institutet, Stockholm, Sweden.</p>
    </ack>
    <notes>
      <title>Data Availability</title>
      <p>The datasets analyzed during this study are not publicly available due to Swedish legislation regarding patient safety and medical confidentiality; however, deidentified data are available from the corresponding author (FA) on reasonable request, subject to jurisdiction. The complete codebase and MLflow logs are publicly available online [<xref ref-type="bibr" rid="ref47">47</xref>].</p>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>AK conceptualized and designed the study, conducted data analysis and interpretation, and drafted the original manuscript. FA contributed to conceptualization, methodology development, formal analysis, and critically reviewed and edited the manuscript. CKH was responsible for data collection, provided the data, secured funding for the study, and contributed to the writing of the final manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Halilaj</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Rajagopal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fiterau</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hicks</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Delp</surname>
              <given-names>SL</given-names>
            </name>
          </person-group>
          <article-title>Machine learning in human movement biomechanics: best practices, common pitfalls, and new opportunities</article-title>
          <source>J Biomech</source>
          <year>2018</year>
          <month>11</month>
          <day>16</day>
          <volume>81</volume>
          <fpage>1</fpage>
          <lpage>11</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30279002"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbiomech.2018.09.009</pub-id>
          <pub-id pub-id-type="medline">30279002</pub-id>
          <pub-id pub-id-type="pii">S0021-9290(18)30730-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC6879187</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McGinley</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Baker</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wolfe</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Morris</surname>
              <given-names>ME</given-names>
            </name>
          </person-group>
          <article-title>The reliability of three-dimensional kinematic gait measurements: a systematic review</article-title>
          <source>Gait Posture</source>
          <year>2009</year>
          <month>04</month>
          <volume>29</volume>
          <issue>3</issue>
          <fpage>360</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1016/j.gaitpost.2008.09.003</pub-id>
          <pub-id pub-id-type="medline">19013070</pub-id>
          <pub-id pub-id-type="pii">S0966-6362(08)00264-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tubez</surname>
              <given-names>François</given-names>
            </name>
            <name name-style="western">
              <surname>Forthomme</surname>
              <given-names>Bénédicte</given-names>
            </name>
            <name name-style="western">
              <surname>Croisier</surname>
              <given-names>Jean-Louis</given-names>
            </name>
            <name name-style="western">
              <surname>Brüls</surname>
              <given-names>Olivier</given-names>
            </name>
            <name name-style="western">
              <surname>Denoël</surname>
              <given-names>Vincent</given-names>
            </name>
            <name name-style="western">
              <surname>Paulus</surname>
              <given-names>Julien</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>Cédric</given-names>
            </name>
          </person-group>
          <article-title>Inter-session reliability of the tennis serve and influence of the laboratory context</article-title>
          <source>J Hum Kinet</source>
          <year>2019</year>
          <month>03</month>
          <volume>66</volume>
          <fpage>57</fpage>
          <lpage>67</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://hdl.handle.net/2268/242163"/>
          </comment>
          <pub-id pub-id-type="doi">10.2478/hukin-2018-0064</pub-id>
          <pub-id pub-id-type="medline">30988840</pub-id>
          <pub-id pub-id-type="pii">hukin-2018-0064</pub-id>
          <pub-id pub-id-type="pmcid">PMC6458585</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Powell</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Fong</surname>
              <given-names>HB</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>AK</given-names>
            </name>
          </person-group>
          <article-title>Increasing breast support is associated with altered knee joint stiffness and contributing knee joint biomechanics during treadmill running</article-title>
          <source>Front Sports Act Living</source>
          <year>2023</year>
          <month>4</month>
          <day>21</day>
          <volume>5</volume>
          <fpage>1113952</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37152112"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fspor.2023.1113952</pub-id>
          <pub-id pub-id-type="medline">37152112</pub-id>
          <pub-id pub-id-type="pmcid">PMC10160436</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Keogh</surname>
              <given-names>JAJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ruder</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>White</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gavrilov</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Phillips</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Heisz</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kobsar</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Longitudinal monitoring of biomechanical and psychological state in collegiate female basketball athletes using principal component analysis</article-title>
          <source>Transl Sports Med</source>
          <year>2024</year>
          <month>4</month>
          <day>3</day>
          <volume>2024</volume>
          <fpage>7858835</fpage>
          <lpage>14</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38654723"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2024/7858835</pub-id>
          <pub-id pub-id-type="medline">38654723</pub-id>
          <pub-id pub-id-type="pmcid">PMC11023736</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pereira</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gadhoumi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pelter</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Do</surname>
              <given-names>DH</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Colorado</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Meisel</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Photoplethysmography based atrial fibrillation detection: a review</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <volume>3</volume>
          <fpage>3</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-019-0207-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-019-0207-9</pub-id>
          <pub-id pub-id-type="medline">31934647</pub-id>
          <pub-id pub-id-type="pii">207</pub-id>
          <pub-id pub-id-type="pmcid">PMC6954115</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sobolev</surname>
              <given-names>M G</given-names>
            </name>
            <name name-style="western">
              <surname>Teja</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Tauhidur</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Advancing the science of digital biomarkers</article-title>
          <source>DigiBiom '21: Proceedings of the 2021 Workshop on Future of Digital Biomarkers</source>
          <year>2021</year>
          <conf-name>MobiSys '21: The 19th Annual International Conference on Mobile Systems, Applications, and Services</conf-name>
          <conf-date>June 25, 2021</conf-date>
          <conf-loc>Virtual</conf-loc>
          <fpage>1</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1145/3469266.3473711</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Raamana</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Engemann</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Hoyos-Idrobo</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Assessing and tuning brain decoders: cross-validation, caveats, and guidelines</article-title>
          <source>Neuroimage</source>
          <year>2017</year>
          <month>01</month>
          <day>15</day>
          <volume>145</volume>
          <issue>Pt B</issue>
          <fpage>166</fpage>
          <lpage>179</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neuroimage.2016.10.038</pub-id>
          <pub-id pub-id-type="medline">27989847</pub-id>
          <pub-id pub-id-type="pii">S1053-8119(16)30595-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Saeb</surname>
              <given-names>Sohrab</given-names>
            </name>
            <name name-style="western">
              <surname>Lonini</surname>
              <given-names>Luca</given-names>
            </name>
            <name name-style="western">
              <surname>Jayaraman</surname>
              <given-names>Arun</given-names>
            </name>
            <name name-style="western">
              <surname>Mohr</surname>
              <given-names>David C</given-names>
            </name>
            <name name-style="western">
              <surname>Kording</surname>
              <given-names>Konrad P</given-names>
            </name>
          </person-group>
          <article-title>The need to approximate the use-case in clinical machine learning</article-title>
          <source>Gigascience</source>
          <year>2017</year>
          <month>05</month>
          <day>01</day>
          <volume>6</volume>
          <issue>5</issue>
          <fpage>1</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28327985"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/gigascience/gix019</pub-id>
          <pub-id pub-id-type="medline">28327985</pub-id>
          <pub-id pub-id-type="pii">3071704</pub-id>
          <pub-id pub-id-type="pmcid">PMC5441397</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Poldrack</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Huckins</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Establishment of best practices for evidence for prediction: a review</article-title>
          <source>JAMA Psychiatry</source>
          <year>2020</year>
          <month>05</month>
          <day>01</day>
          <volume>77</volume>
          <issue>5</issue>
          <fpage>534</fpage>
          <lpage>540</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31774490"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamapsychiatry.2019.3671</pub-id>
          <pub-id pub-id-type="medline">31774490</pub-id>
          <pub-id pub-id-type="pii">2756204</pub-id>
          <pub-id pub-id-type="pmcid">PMC7250718</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Serbecic</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Beutelspacher</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Markovic</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Roy</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Shetty</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Repeatability and reproducibility of corneal biomechanical parameters derived from Corvis ST</article-title>
          <source>Eur J Ophthalmol</source>
          <year>2020</year>
          <month>11</month>
          <day>20</day>
          <volume>30</volume>
          <issue>6</issue>
          <fpage>1287</fpage>
          <lpage>1294</lpage>
          <pub-id pub-id-type="doi">10.1177/1120672119864554</pub-id>
          <pub-id pub-id-type="medline">31744320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jie</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Corneal biomechanical properties in a selected Chinese population, measured using the corneal visualization Scheimpflug technology</article-title>
          <source>Front Bioeng Biotechnol</source>
          <year>2022</year>
          <volume>10</volume>
          <fpage>863240</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35497328"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fbioe.2022.863240</pub-id>
          <pub-id pub-id-type="medline">35497328</pub-id>
          <pub-id pub-id-type="pii">863240</pub-id>
          <pub-id pub-id-type="pmcid">PMC9043322</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Masson</surname>
              <given-names>A O</given-names>
            </name>
            <name name-style="western">
              <surname>Besler</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Edwards</surname>
              <given-names>W B</given-names>
            </name>
            <name name-style="western">
              <surname>Krawetz</surname>
              <given-names>R J</given-names>
            </name>
          </person-group>
          <article-title>High spatial resolution analysis using automated indentation mapping differentiates biomechanical properties of normal vs. degenerated articular cartilage in mice</article-title>
          <source>Elife</source>
          <year>2022</year>
          <month>11</month>
          <day>29</day>
          <volume>11</volume>
          <fpage>1</fpage>
          <lpage>18</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36444976"/>
          </comment>
          <pub-id pub-id-type="doi">10.7554/eLife.74664</pub-id>
          <pub-id pub-id-type="medline">36444976</pub-id>
          <pub-id pub-id-type="pii">74664</pub-id>
          <pub-id pub-id-type="pmcid">PMC9750174</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Karbalaie</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Strong</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nordström</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schelin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Selling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Grip</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Prorok</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Häger</surname>
              <given-names>C K</given-names>
            </name>
          </person-group>
          <article-title>Beyond self-reports after anterior cruciate ligament injury - machine learning methods for classifying and identifying movement patterns related to fear of re-injury</article-title>
          <source>J Sports Sci</source>
          <year>2026</year>
          <month>02</month>
          <day>23</day>
          <volume>44</volume>
          <issue>3</issue>
          <fpage>342</fpage>
          <lpage>356</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/10.1080/02640414.2025.2578584?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/02640414.2025.2578584</pub-id>
          <pub-id pub-id-type="medline">41131712</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Varma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Simon</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Bias in error estimation when using cross-validation for model selection</article-title>
          <source>BMC Bioinformatics</source>
          <year>2006</year>
          <month>02</month>
          <day>23</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>91</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-7-91"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-7-91</pub-id>
          <pub-id pub-id-type="medline">16504092</pub-id>
          <pub-id pub-id-type="pii">1471-2105-7-91</pub-id>
          <pub-id pub-id-type="pmcid">PMC1397873</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cawley</surname>
              <given-names>G C</given-names>
            </name>
            <name name-style="western">
              <surname>Talbot</surname>
              <given-names>N L C</given-names>
            </name>
          </person-group>
          <article-title>On over-fitting in model selection and subsequent selection bias in performance evaluation</article-title>
          <source>The Journal of Machine Learning Research</source>
          <year>2010</year>
          <volume>11</volume>
          <fpage>2079</fpage>
          <lpage>2107</lpage>
          <pub-id pub-id-type="doi">10.5555/1756006.1859921</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>G S</given-names>
            </name>
            <name name-style="western">
              <surname>Moons</surname>
              <given-names>K G M</given-names>
            </name>
            <name name-style="western">
              <surname>Dhiman</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>R D</given-names>
            </name>
            <name name-style="western">
              <surname>Beam</surname>
              <given-names>A L</given-names>
            </name>
            <name name-style="western">
              <surname>Van Calster</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Reitsma</surname>
              <given-names>J B</given-names>
            </name>
            <name name-style="western">
              <surname>Van Smeden</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Boulesteix</surname>
              <given-names>A L</given-names>
            </name>
            <name name-style="western">
              <surname>Camaradou</surname>
              <given-names>J C</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>L A</given-names>
            </name>
            <name name-style="western">
              <surname>Denaxas</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Denniston</surname>
              <given-names>A K</given-names>
            </name>
            <name name-style="western">
              <surname>Glocker</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Golub</surname>
              <given-names>R M</given-names>
            </name>
            <name name-style="western">
              <surname>Harvey</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Heinze</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffman</surname>
              <given-names>M M</given-names>
            </name>
            <name name-style="western">
              <surname>Kengne</surname>
              <given-names>A P</given-names>
            </name>
            <name name-style="western">
              <surname>Lam</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Loder</surname>
              <given-names>E W</given-names>
            </name>
            <name name-style="western">
              <surname>Maier-Hein</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mateen</surname>
              <given-names>B A</given-names>
            </name>
            <name name-style="western">
              <surname>McCradden</surname>
              <given-names>M D</given-names>
            </name>
            <name name-style="western">
              <surname>Oakden-Rayner</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ordish</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Parnell</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rose</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wynants</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Logullo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods</article-title>
          <source>BMJ</source>
          <year>2024</year>
          <month>04</month>
          <day>16</day>
          <volume>385</volume>
          <fpage>e078378</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bmj.com/lookup/pmidlookup?view=long&amp;pmid=38626948"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj-2023-078378</pub-id>
          <pub-id pub-id-type="medline">38626948</pub-id>
          <pub-id pub-id-type="pmcid">PMC11019967</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Markström</surname>
              <given-names>J L</given-names>
            </name>
            <name name-style="western">
              <surname>Schelin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Häger</surname>
              <given-names>C K</given-names>
            </name>
          </person-group>
          <article-title>A novel standardised side hop test reliably evaluates landing mechanics for anterior cruciate ligament reconstructed persons and controls</article-title>
          <source>Sports Biomech</source>
          <year>2021</year>
          <month>03</month>
          <day>10</day>
          <volume>20</volume>
          <issue>2</issue>
          <fpage>213</fpage>
          <lpage>229</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/10.1080/14763141.2018.1538385?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/14763141.2018.1538385</pub-id>
          <pub-id pub-id-type="medline">30526381</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Markström</surname>
              <given-names>J L</given-names>
            </name>
            <name name-style="western">
              <surname>Grinberg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Häger</surname>
              <given-names>C K</given-names>
            </name>
          </person-group>
          <article-title>Fear of reinjury following anterior cruciate ligament reconstruction is manifested in muscle activation patterns of single-leg side-hop landings</article-title>
          <source>Phys Ther</source>
          <year>2022</year>
          <month>02</month>
          <day>01</day>
          <volume>102</volume>
          <issue>2</issue>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34554253"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ptj/pzab218</pub-id>
          <pub-id pub-id-type="medline">34554253</pub-id>
          <pub-id pub-id-type="pii">6373317</pub-id>
          <pub-id pub-id-type="pmcid">PMC8860189</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Christ</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Braun</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Neuffer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kempa-Liehr</surname>
              <given-names>AW</given-names>
            </name>
          </person-group>
          <article-title>Time series feature extraction on basis of scalable hypothesis tests (tsfresh – a Python package)</article-title>
          <source>Neurocomputing</source>
          <year>2018</year>
          <month>09</month>
          <volume>307</volume>
          <fpage>72</fpage>
          <lpage>77</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2018.03.067</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arlot</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Celisse</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A survey of cross-validation procedures for model selection</article-title>
          <source>Statist Surv</source>
          <year>2010</year>
          <month>01</month>
          <day>01</day>
          <volume>4</volume>
          <issue>none</issue>
          <fpage>40</fpage>
          <lpage>79</lpage>
          <pub-id pub-id-type="doi">10.1214/09-SS054</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilimitis</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Walsh</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Practical considerations and applied examples of cross-validation for model development and evaluation in health care: tutorial</article-title>
          <source>JMIR AI</source>
          <year>2023</year>
          <month>12</month>
          <day>18</day>
          <volume>2</volume>
          <fpage>e49023</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ai.jmir.org/2023//e49023/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/49023</pub-id>
          <pub-id pub-id-type="medline">38875530</pub-id>
          <pub-id pub-id-type="pii">v2i1e49023</pub-id>
          <pub-id pub-id-type="pmcid">PMC11041453</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Demsar</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Statistical comparisons of classifiers over multiple data sets</article-title>
          <source>Journal of Machine Learning Research</source>
          <year>2006</year>
          <volume>7</volume>
          <fpage>1</fpage>
          <lpage>30</lpage>
          <pub-id pub-id-type="doi">10.5555/1248547.1248548</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>García</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fernández</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Luengo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Herrera</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Advanced nonparametric tests for multiple comparisons in the design of experiments in computational intelligence and data mining: experimental analysis of power</article-title>
          <source>Information Sciences</source>
          <year>2010</year>
          <month>05</month>
          <volume>180</volume>
          <issue>10</issue>
          <fpage>2044</fpage>
          <lpage>2064</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ins.2009.12.010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pedregosa</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Thirion</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grisel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Blondel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Prettenhofer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dubourg</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vanderplas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Passos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cournapeau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brucher</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perrot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Duchesnay</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Scikit-learn: machine learning in Python</article-title>
          <source>The Journal of Machine Learning Research</source>
          <year>2011</year>
          <month>06</month>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>2825</fpage>
          <lpage>2830</lpage>
          <pub-id pub-id-type="doi">10.5555/1953048.2078195</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>XGBoost: a scalable tree boosting system</article-title>
          <year>2016</year>
          <conf-name>22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>August 13-17, 2016</conf-date>
          <conf-loc>San Francisco, CA</conf-loc>
          <fpage>785</fpage>
          <lpage>794</lpage>
          <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ke</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Finley</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T Y</given-names>
            </name>
          </person-group>
          <article-title>LightGBM: a highly efficient gradient boosting decision tree</article-title>
          <year>2017</year>
          <conf-name>31st International Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <fpage>3149</fpage>
          <lpage>3157</lpage>
          <pub-id pub-id-type="doi">10.5555/3294996.3295074</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vabalas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gowen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Poliakoff</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Casson</surname>
              <given-names>AJ</given-names>
            </name>
          </person-group>
          <article-title>Machine learning algorithm validation with a limited sample size</article-title>
          <source>PLoS One</source>
          <year>2019</year>
          <month>11</month>
          <day>7</day>
          <volume>14</volume>
          <issue>11</issue>
          <fpage>e0224365</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0224365"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0224365</pub-id>
          <pub-id pub-id-type="medline">31697686</pub-id>
          <pub-id pub-id-type="pii">PONE-D-19-13163</pub-id>
          <pub-id pub-id-type="pmcid">PMC6837442</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Verbeke</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Molenberghs</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rizopoulos</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Random effects models for longitudinal data</article-title>
          <source>Longitudinal Research with Latent Variables</source>
          <year>2010</year>
          <publisher-loc>Berlin, Heidelberg</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marusich</surname>
              <given-names>LR</given-names>
            </name>
            <name name-style="western">
              <surname>Bakdash</surname>
              <given-names>JZ</given-names>
            </name>
          </person-group>
          <article-title>rmcorrShiny: a web and standalone application for repeated measures correlation</article-title>
          <source>F1000Res</source>
          <year>2021</year>
          <month>11</month>
          <day>8</day>
          <volume>10</volume>
          <fpage>697</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34621514"/>
          </comment>
          <pub-id pub-id-type="doi">10.12688/f1000research.55027.2</pub-id>
          <pub-id pub-id-type="medline">34621514</pub-id>
          <pub-id pub-id-type="pmcid">PMC8456376</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krstajic</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Buturovic</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Leahy</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Cross-validation pitfalls when selecting and assessing regression and classification models</article-title>
          <source>J Cheminform</source>
          <year>2014</year>
          <month>03</month>
          <day>29</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>10</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.doi.org/10.1186/1758-2946-6-10"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1758-2946-6-10</pub-id>
          <pub-id pub-id-type="medline">24678909</pub-id>
          <pub-id pub-id-type="pii">10.1186/1758-2946-6-10</pub-id>
          <pub-id pub-id-type="pmcid">PMC3994246</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steyerberg</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Harrell</surname>
              <given-names>FE</given-names>
            </name>
          </person-group>
          <article-title>Prediction models need appropriate internal, internal-external, and external validation</article-title>
          <source>J Clin Epidemiol</source>
          <year>2016</year>
          <month>01</month>
          <volume>69</volume>
          <fpage>245</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25981519"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jclinepi.2015.04.005</pub-id>
          <pub-id pub-id-type="medline">25981519</pub-id>
          <pub-id pub-id-type="pii">S0895-4356(15)00175-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC5578404</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Neto</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Pratap</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Perumal</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Tummalacherla</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bot</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Trister</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Friend</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Mangravite</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Omberg</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Learning disease vs participant signatures: a permutation test approach to detect identity confounding in machine learning diagnostic applications</article-title>
          <source>arXiv. Published online December 8, 2017</source>
          <year>2017</year>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/1712.03120"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Goodacre</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>On splitting training and validation set: a comparative study of cross-validation, bootstrap and systematic sampling for estimating the generalization performance of supervised learning</article-title>
          <source>J Anal Test</source>
          <year>2018</year>
          <month>10</month>
          <day>29</day>
          <volume>2</volume>
          <issue>3</issue>
          <fpage>249</fpage>
          <lpage>262</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30842888"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s41664-018-0068-2</pub-id>
          <pub-id pub-id-type="medline">30842888</pub-id>
          <pub-id pub-id-type="pii">68</pub-id>
          <pub-id pub-id-type="pmcid">PMC6373628</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>YD</given-names>
            </name>
            <name name-style="western">
              <surname>Nam</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Heo</surname>
              <given-names>JH</given-names>
            </name>
          </person-group>
          <article-title>Machine learning–based model for prediction of outcomes in acute stroke</article-title>
          <source>Stroke</source>
          <year>2019</year>
          <month>05</month>
          <volume>50</volume>
          <issue>5</issue>
          <fpage>1263</fpage>
          <lpage>1265</lpage>
          <pub-id pub-id-type="doi">10.1161/strokeaha.118.024293</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dehghani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Glatard</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shihab</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Subject cross validation in human activity recognition</article-title>
          <source>arXiv. Published online April 4, 2019</source>
          <year>2019</year>
          <fpage>1904</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1904.02666"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1904.02666</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ferdinandy</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gerencsér</surname>
              <given-names>Linda</given-names>
            </name>
            <name name-style="western">
              <surname>Corrieri</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Újváry</surname>
              <given-names>Dóra</given-names>
            </name>
            <name name-style="western">
              <surname>Csizmadia</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Miklósi</surname>
              <given-names>Ádám</given-names>
            </name>
          </person-group>
          <article-title>Challenges of machine learning model validation using correlated behaviour data: evaluation of cross-validation strategies and accuracy measures</article-title>
          <source>PLoS One</source>
          <year>2020</year>
          <month>7</month>
          <day>20</day>
          <volume>15</volume>
          <issue>7</issue>
          <fpage>e0236092</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0236092"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0236092</pub-id>
          <pub-id pub-id-type="medline">32687528</pub-id>
          <pub-id pub-id-type="pii">PONE-D-20-06582</pub-id>
          <pub-id pub-id-type="pmcid">PMC7371169</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Islam</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>McCord</surname>
              <given-names>PH</given-names>
            </name>
            <name name-style="western">
              <surname>Olatoye</surname>
              <given-names>MO</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sood</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lipka</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Todd</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>Experimental evaluation of genomic selection prediction for rust resistance in sugarcane</article-title>
          <source>Plant Genome</source>
          <year>2021</year>
          <month>11</month>
          <day>12</day>
          <volume>14</volume>
          <issue>3</issue>
          <fpage>e20148</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.1002/tpg2.20148"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/tpg2.20148</pub-id>
          <pub-id pub-id-type="medline">34510803</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Machine learning application for prediction of surgical site infection after posterior cervical surgery</article-title>
          <source>Int Wound J</source>
          <year>2024</year>
          <month>04</month>
          <day>28</day>
          <volume>21</volume>
          <issue>4</issue>
          <fpage>e14607</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38155433"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/iwj.14607</pub-id>
          <pub-id pub-id-type="medline">38155433</pub-id>
          <pub-id pub-id-type="pmcid">PMC10961862</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhan</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Using machine learning to predict surgical site infection after lumbar spine surgery</article-title>
          <source>IDR</source>
          <year>2023</year>
          <month>08</month>
          <volume>16</volume>
          <fpage>5197</fpage>
          <lpage>5207</lpage>
          <pub-id pub-id-type="doi">10.2147/idr.s417431</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ghasemzadeh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hillman</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Mehta</surname>
              <given-names>DD</given-names>
            </name>
          </person-group>
          <article-title>Toward generalizable machine learning models in speech, language, and hearing sciences: estimating sample size and reducing overfitting</article-title>
          <source>J Speech Lang Hear Res</source>
          <year>2024</year>
          <month>03</month>
          <day>11</day>
          <volume>67</volume>
          <issue>3</issue>
          <fpage>753</fpage>
          <lpage>781</lpage>
          <pub-id pub-id-type="doi">10.1044/2023_jslhr-23-00273</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lumumba</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kiprotich</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mpaine</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Makena</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kavita</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparative analysis of cross-validation techniques: LOOCV, K-folds cross-validation, and repeated K-folds cross-validation in machine learning models</article-title>
          <source>AJTAS</source>
          <year>2024</year>
          <month>10</month>
          <day>10</day>
          <volume>13</volume>
          <issue>5</issue>
          <fpage>127</fpage>
          <lpage>137</lpage>
          <pub-id pub-id-type="doi">10.11648/j.ajtas.20241305.13</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Petrick</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Delfino</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Gallas</surname>
              <given-names>BD</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Krainak</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sahiner</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Samala</surname>
              <given-names>RK</given-names>
            </name>
          </person-group>
          <article-title>Regulatory considerations for medical imaging AI/ML devices in the United States: concepts and challenges</article-title>
          <source>J Med Imag</source>
          <year>2023</year>
          <month>9</month>
          <day>1</day>
          <volume>10</volume>
          <issue>05</issue>
          <fpage>051804</fpage>
          <pub-id pub-id-type="doi">10.1117/1.jmi.10.5.051804</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <article-title>Regulation (EU) 2017/745 of the European Parliament and of the Council of 5 April 2017 on medical devices, amending Directive 2001/83/EC, Regulation (EC) No 178/2002 and Regulation (EC) No 1223/2009 and repealing Council Directives 90/385/EEC and 93/42/EEC (Text with EEA relevance. )</article-title>
          <source>European Union (Eur-Lex)</source>
          <access-date>2026-04-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://eur-lex.europa.eu/eli/reg/2017/745/oj/eng">https://eur-lex.europa.eu/eli/reg/2017/745/oj/eng</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <article-title>Proposal for a Regulation laying down harmonised rules on artificial intelligence</article-title>
          <source>European Commission</source>
          <year>2021</year>
          <month>4</month>
          <day>21</day>
          <access-date>2026-04-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://digital-strategy.ec.europa.eu/en/library/proposal-regulation-laying-down-harmonised-rules-artificial-intelligence">https://digital-strategy.ec.europa.eu/en/library/proposal-regulation-laying-down-harmonised-rules-artificial-intelligence</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Godau</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kalinowski</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Christodoulou</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Reinke</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tizabi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ferrer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jäger</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Maier-Hein</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Navigating prevalence shifts in image analysis algorithm deployment</article-title>
          <source>Med Image Anal</source>
          <year>2025</year>
          <month>05</month>
          <volume>102</volume>
          <fpage>103504</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1361-8415(25)00052-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.media.2025.103504</pub-id>
          <pub-id pub-id-type="medline">40020420</pub-id>
          <pub-id pub-id-type="pii">S1361-8415(25)00052-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Karbalaie</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Abtahi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Häger</surname>
              <given-names>CK</given-names>
            </name>
          </person-group>
          <article-title>Subject-aware model validation pipeline for repeated-measures data</article-title>
          <source>GitHub</source>
          <year>2025</year>
          <access-date>2025-06-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/abdkar/Nested_LOPOCV">https://github.com/abdkar/Nested_LOPOCV</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
