<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v3i1e48067</article-id>
      <article-id pub-id-type="pmid">38875598</article-id>
      <article-id pub-id-type="doi">10.2196/48067</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Improving Risk Prediction of Methicillin-Resistant Staphylococcus aureus Using Machine Learning Methods With Network Features: Retrospective Development Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Malin</surname>
            <given-names>Bradley</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sewell</surname>
            <given-names>Daniel</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhao</surname>
            <given-names>Beiqun</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kamruzzaman</surname>
            <given-names>Methun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8680-7061</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Heavey</surname>
            <given-names>Jack</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-4351-5104</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Song</surname>
            <given-names>Alexander</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-0367-1304</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Bielskas</surname>
            <given-names>Matthew</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-2327-0690</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Bhattacharya</surname>
            <given-names>Parantapa</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3626-9939</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Madden</surname>
            <given-names>Gregory</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5951-3156</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Klein</surname>
            <given-names>Eili</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1304-5289</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Deng</surname>
            <given-names>Xinwei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1560-2405</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Vullikanti</surname>
            <given-names>Anil</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>University of Virginia</institution>
            <addr-line>Biocomplexity Institute P.O. Box 400298</addr-line>
            <addr-line>Charlottesville, VA, 22904</addr-line>
            <country>United States</country>
            <phone>1 5405773102</phone>
            <email>vsakumar@virginia.edu</email>
          </address>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8597-6197</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>University of Virginia</institution>
        <addr-line>Charlottesville, VA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Division of Infectious Diseases &#38; International Health</institution>
        <institution>Department of Medicine</institution>
        <institution>University of Virginia School of Medicine</institution>
        <addr-line>Charlottesville, VA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Emergency Medicine</institution>
        <institution>Johns Hopkins School of Medicine</institution>
        <addr-line>Baltimore, MD</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Center for Disease Dynamics, Economics and Policy</institution>
        <addr-line>Washington, DC</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Statistics</institution>
        <institution>Virginia Tech</institution>
        <addr-line>Blacksburg, VA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Computer Science</institution>
        <institution>University of Virginia</institution>
        <addr-line>Charlottesville, VA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Anil Vullikanti <email>vsakumar@virginia.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>16</day>
        <month>5</month>
        <year>2024</year>
      </pub-date>
      <volume>3</volume>
      <elocation-id>e48067</elocation-id>
      <history>
        <date date-type="received">
          <day>10</day>
          <month>4</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>28</day>
          <month>9</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>13</day>
          <month>1</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Methun Kamruzzaman, Jack Heavey, Alexander Song, Matthew Bielskas, Parantapa Bhattacharya, Gregory Madden, Eili Klein, Xinwei Deng, Anil Vullikanti. Originally published in JMIR AI (https://ai.jmir.org), 16.05.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2024/1/e48067" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Health care–associated infections due to multidrug-resistant organisms (MDROs), such as methicillin-resistant <italic>Staphylococcus aureus</italic> (MRSA) and <italic>Clostridioides difficile</italic> (CDI), place a significant burden on our health care infrastructure.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Screening for MDROs is an important mechanism for preventing spread but is resource intensive. The objective of this study was to develop automated tools that can predict colonization or infection risk using electronic health record (EHR) data, provide useful information to aid infection control, and guide empiric antibiotic coverage.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We retrospectively developed a machine learning model to detect MRSA colonization and infection in undifferentiated patients at the time of sample collection from hospitalized patients at the University of Virginia Hospital. We used clinical and nonclinical features derived from on-admission and throughout-stay information from the patient’s EHR data to build the model. In addition, we used a class of features derived from contact networks in EHR data; these network features can capture patients’ contacts with providers and other patients, improving model interpretability and accuracy for predicting the outcome of surveillance tests for MRSA. Finally, we explored heterogeneous models for different patient subpopulations, for example, those admitted to an intensive care unit or emergency department or those with specific testing histories, which perform better.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We found that the penalized logistic regression performs better than other methods, and this model’s performance measured in terms of its receiver operating characteristics-area under the curve score improves by nearly 11% when we use polynomial (second-degree) transformation of the features. Some significant features in predicting MDRO risk include antibiotic use, surgery, use of devices, dialysis, patient’s comorbidity conditions, and network features. Among these, network features add the most value and improve the model’s performance by at least 15%. The penalized logistic regression model with the same transformation of features also performs better than other models for specific patient subpopulations.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our study shows that MRSA risk prediction can be conducted quite effectively by machine learning methods using clinical and nonclinical features derived from EHR data. Network features are the most predictive and provide significant improvement over prior methods. Furthermore, heterogeneous prediction models for different patient subpopulations enhance the model’s performance.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>methicillin-resistant Staphylococcus aureus</kwd>
        <kwd>network</kwd>
        <kwd>machine learning</kwd>
        <kwd>penalized logistic regression</kwd>
        <kwd>ensemble learning</kwd>
        <kwd>gradient-boosted classifier</kwd>
        <kwd>random forest classifier</kwd>
        <kwd>extreme boosted gradient boosted classifier</kwd>
        <kwd>Shapley Additive Explanations</kwd>
        <kwd>SHAP</kwd>
        <kwd>health care–associated infection</kwd>
        <kwd>HAI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Multidrug-resistant organisms (MDROs), such as <italic>Clostridioides difficile</italic> (CDI), multidrug-resistant gram-negative bacteria (carbapenem-resistant <italic>Acinetobacter baumannii</italic> and carbapenem-resistant Enterobacterales), methicillin-resistant <italic>Staphylococcus aureus</italic> (MRSA), and vancomycin-resistant enterococci, are among the top 10 threats to global health [<xref ref-type="bibr" rid="ref1">1</xref>]. Health care–associated infections (HAIs) due to MDROs are associated with increased complications, longer hospital stays, and increased mortality. For example, Weiner-Lastinger et al [<xref ref-type="bibr" rid="ref2">2</xref>] report that HAIs have resulted in billions of dollars in increased health care costs [<xref ref-type="bibr" rid="ref3">3</xref>]. MRSA is one of the most common causes of HAIs and a serious antimicrobial resistance threat, responsible for &#62;10,000 deaths a year in the United States alone [<xref ref-type="bibr" rid="ref4">4</xref>]. Similar to many other MDROs, MRSA can be easily spread in a hospital from hospitalized patients via contact with the health care environment (ie, shared patient rooms) and health care workers.</p>
      <p>Antimicrobial stewardship, which seeks to optimize antibiotic treatment regimens, and infection prevention and control, which involves monitoring, investigating, and managing factors related to MDRO transmission, are the main tools for mitigating the risks of acquisition and severe outcomes of MDROs [<xref ref-type="bibr" rid="ref5">5</xref>]. Surveillance testing is a critical component of both antimicrobial stewardship and infection prevention control. However, testing is expensive and slow; current laboratory procedures typically require at least 72 hours to report MRSA found in a patient’s culture [<xref ref-type="bibr" rid="ref6">6</xref>]. The delay in testing results in three problems in the hospital: (1) colonized patients remain undetected, leading to potential spread; (2) clinicians treat infections empirically; and (3) increased resource use for contact precautions, leading to both over- and undertreatment.</p>
      <p>While several different studies have examined MRSA risk prediction (eg, [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]), none to date have progressed to clinical practice due to limitations in generalizability, sample size, and imbalanced data (these are discussed further in the Discussion section). In this study, we demonstrate how improving the hospital context, particularly how patients are connected, can improve the performance of machine learning methods for predicting the outcomes of MRSA surveillance tests, using a rich set of clinical and nonclinical features derived from on-admission and throughout-stay information from a large electronic health record (EHR) data set for patients admitted to the University of Virginia (UVA) Hospital.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Set</title>
        <p>We used patient data from the UVA Hospital during 2010-2022. Overall, 27,612 patients in the data set were tested for MRSA, and 4171 (15.11%) of them were positive; these patients had 37,237 hospital encounters. The data of each patient’s visit can be separated into two parts: (1) on-admission data and (2) clinical event or throughout-stay data, which we have described here:</p>
        <p><italic>On-admission</italic> data consist of patient demographics and visit information. Patient demographics include information about age, gender, race, ethnicity, country, and state. Visit information includes admission and discharge dates, admission source, admission type, and discharge destination.</p>
        <p>Clinical event data represent information collected during the visit. We considered the following event data:</p>
        <list list-type="bullet">
          <list-item>
            <p>Procedure: it includes the following kinds of events during this visit or at any time 90 days before this visit: (1) surgeries, (2) device implant or replacement, and (3) dialysis. For a visit, no data after the test collection are used.</p>
          </list-item>
          <list-item>
            <p>Medication: as MRSA is resistant to specific antibiotics, we also examined prior antibiotic use. We computed the <italic>Days on Therapy</italic>, which indicates whether a patient takes any antibiotic on any specific day. This feature also calculates whether a patient took any antibiotic in the last 90 days of this hospital visit.</p>
          </list-item>
          <list-item>
            <p>Comorbidity: the International Classification of Diseases, Tenth Revision, code of a patient, which is collected from that patient’s medical history, is used to pull comorbidity information using the comorbidity package in R programming language (R Foundation for Statistical Computing). Both Charlson and Elixhauser scores are pulled. It involves other physical conditions such as diabetes, a history of stroke, and a history of dementia.</p>
          </list-item>
          <list-item>
            <p>MRSA laboratory test: we included both (1) clinical cultures and blood, respiratory, and urine samples collected as part of routine care, which typically requires 48 to 72 hours to return results, and (2) polymerase chain reaction (PCR) surveillance tests, which are administered to MRSA-negative patients admitted to an intensive care unit (ICU; per current hospital policy) or per physician request and typically return results in &#60;72 hours. While surveillance tests provide positive and negative results, clinical cultures may be sent from specimens that are not expected to yield MRSA, even in the presence of an active MRSA infection; therefore, a negative clinical culture result is not considered a definite indicator of noninfection. The nares MRSA PCR likely has equal or higher sensitivity than the nares culture for MRSA [<xref ref-type="bibr" rid="ref14">14</xref>]. We noted that, in general, testing is not completely unbiased (a patient with an MRSA-positive result admitted to an ICU would not technically need to be screened if they are already on precautions), which might impact the quality of the data set and the results, as we discuss later in the Discussion section.</p>
          </list-item>
        </list>
        <p>We applied state-of-the-art machine learning methods to predict the risk of MRSA infection at a given time for a patient, modeled by the outcome of a surveillance test. The data set is split into training (80%) and testing (20%) portions. The model is estimated using the training data, and the hyperparameters are chosen by cross-validation. There are many metrics to evaluate model performance. We used receiver operating characteristics-area under the curve (ROC-AUC) as the overall performance metric of the model (the model evaluation metrics are described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), and a higher value is better. For clinicians, an important objective is to reduce the number of false-negative cases. Therefore, we also used the <italic>false negative rate</italic> <inline-graphic xlink:href="ai_v3i1e48067_fig8.png" xlink:type="simple" mimetype="image"/> to evaluate the model performance, with a lower value indicating a lower false-negative prediction. The overall model performance is proportional to the ROC-AUC score and inversely proportional to the FNR score.</p>
      </sec>
      <sec>
        <title>Problem Statement</title>
        <p>The d-days ahead model’s MRSA test prediction problem: using features defined from the patient EHR data till some time (<italic>t’ = t – d</italic>), predict the outcome of an MRSA surveillance test performed at time <italic>t</italic>. Formally, let <italic>x(t’)</italic> denote a feature vector for a patient defined till time <italic>t’</italic> and let <italic>y(t)</italic> denote the result of an MRSA surveillance test performed at time <italic>t</italic>. The objective is to predict if <italic>y(t)</italic> = 1 using <italic>x(t’)</italic>.</p>
        <p>The specific questions we study are as follows:</p>
        <list list-type="order">
          <list-item>
            <p>How well can MRSA surveillance test results be predicted? What machine learning methods perform well, and what features are the most predictive?</p>
          </list-item>
          <list-item>
            <p>Are better predictions possible for specific, meaningful subpopulations?</p>
          </list-item>
          <list-item>
            <p>How does the performance vary with <italic>d</italic>?</p>
          </list-item>
          <list-item>
            <p>Does training with a biased data set (as performed in previous work) impact the true performance?</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Interesting Features</title>
        <p>Several risk factors for MRSA have been identified in previous studies [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]: (1) hospitalization within the past 6 to 12 months, (2) residing in a chronic care facility, (3) being a health care worker, (4) being an intravenous drug user, (5) frequent antibiotic use, (6) antimicrobial therapy within 1 year, (7) history of endotracheal intubation, (8) underlying chronic disorder, (9) presence of an indwelling venous or urinary catheter, (10) history of any surgical procedure, (11) household contact with an identified risk factor, and (12) hypoalbuminemia. We extracted all the aforementioned features from the UVA data set. We created patient-patient and patient-provider interaction networks and extracted the following features from those networks. In addition, we derived many features based on the existing features described in the subsequent section. The total number of features is 108, and the MRSA test outcome is the target feature.</p>
        <p>1. Network features: we constructed a contact network <italic>G = (V, E)</italic> (as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>), in which we have patient nodes <italic>u<sub>p</sub> ∈ V</italic> for each patient <italic>p</italic> and a provider node <italic>u<sub>h</sub> ∈ V</italic> for each provider <italic>h</italic>. An edge or contact <italic>(u<sub>p1</sub>, u<sub>p2</sub>)</italic> ∈ E between 2 patient nodes u<sub>p1</sub> and u<sub>p2</sub> indicates that both patients p<sub>1</sub> and p<sub>2</sub>, respectively, were colocated (share a common space, a hospital unit in our case) for at least a certain period, in this case at least 900 seconds. Similarly, we defined patient-provider contacts. For instance, in <xref rid="figure1" ref-type="fig">Figure 1</xref>, patient P<sub>1</sub> and provider H<sub>1</sub> are colocated at time t<sub>1</sub>, which is represented as edge (u<italic><sub>p1</sub></italic>, u<italic><sub>h1</sub></italic>). The #provider incidents on patient P<sub>1</sub> in the time interval [<italic>t<sub>1</sub>, t<sub>2</sub></italic>] is 2, whereas in the time interval [<italic>t<sub>1</sub>, t<sub>3</sub></italic>], it is 3. We did not use the number of patients and providers that a patient comes into direct contact with as a feature. Instead, we defined slightly different features based on contacts during a time interval, which we found to be more predictive. We take time to be in days. On the basis of the number of contacts for a patient <italic>p</italic> or a provider <italic>h</italic>
            over a period, we constructed the following features:</p>
        <list list-type="bullet">
          <list-item>
            <p><italic>MRSA α</italic>: for a patient p, S<sub>p,t</sub>(α) = {p’: (u<sub>p</sub>, u<sub>p’</sub>) <italic>∈ E, p’ is labeled positive at time t’ ∈ (t – α, t]}</italic>, denotes the set of patients who came in contact with <italic>p</italic> and tested positive in the last α days. We refer to &#124;S<sub>p,t</sub>(α)&#124; as MRSA α.</p>
          </list-item>
          <list-item>
            <p>Provider β: for a patient p, §<sub>p,t</sub> (β) = {h: (u<sub>p</sub>, u<sub>h</sub>) <italic>∈ E, h visited p at time t’ ∈ (t – β, t]}</italic>. We refer to &#124;§<sub>p,t</sub> (β)&#124; as Provider β.</p>
          </list-item>
          <list-item>
            <p>MRSA positive patients colocated with the patient <italic>l</italic>: at the UVA Hospital, patients with an MRSA-positive result might be “cohorted,” that is, they might share a room because they have similar precautions to improve occupancy. For a patient <italic>p</italic>, let ƒ<sub>p,t</sub>(u, γ) = {p’:(u<sub>p</sub>, u<sub>p’</sub>) ∈ E, p’ is labeled positive at t’ <italic>∈ (t – γ,t]</italic> and is in the hospital unit u with p}. We referred to &#124;ƒ<sub>p,t</sub>(u,γ)&#124; as the number of patients with colocated MRSA.</p>
          </list-item>
          <list-item>
            <p><italic>Bed reuse</italic> Π: let Π<sub>p,t</sub>(x) = {p’: (u<sub>p</sub>, u<sub>p’</sub>) ∉ E, p’ is labeled positive at time t’&#60;t and stayed in the same bed <italic>x</italic>}. We refer to &#124; Π<sub>p,t</sub>(x)&#124; as the number of times Bed <italic>x</italic> reuse.</p>
          </list-item>
        </list>
        <p>Note that all of the aforementioned features are defined for a particular time, t. Therefore, MRSA <italic>α</italic> and other features should be indexed by the patient and time. To avoid notational clutter, we omit them here when they are clear from the context. For example, suppose t<sub>1</sub>=1, t<sub>2</sub>=2, t<sub>3</sub>=3, t<sub>4</sub>=4, and t<sub>5</sub>=5, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. Suppose patient P<sub>2</sub> is tested positive at time 4. Then, for patient P<sub>1</sub>, we would have “MRSA 2” at time t=5 equal to 1, but “MRSA 2” at time t=3 equals 0. For patient P<sub>2</sub>, Provider 2 at time t=2 is 0, but Provider 2 at time t=3 is 1.</p>
        <p>2. Length of stay: for patients <italic>p</italic> in a hospital encounter, let <italic>t<sub>1</sub></italic> denote the admission time and <italic>t</italic> denote the MRSA test time. The corresponding length of hospital stay (before the MRSA test) was computed as t–t<sub>1</sub>. For the d-days (d ≥ 0) ahead model, we computed the corresponding length of stay (before the MRSA test) as max{t-d-t<sub>1</sub>, 0}. Note that t-d-t<sub>1</sub> could be negative if the patient has not been in the hospital long enough—in this case, we took the length of stay to be 0.</p>
        <p>3. From the health care facility is a Boolean feature that indicates whether the patient is admitted to the hospital from either “skilled nursing, intermediate care, or assisted living facility” or “long term acute care hospital.” For the d-days ahead model, the feature is defined to be 0 if <italic>t<sub>1</sub></italic>-d&#60;0, where <italic>t<sub>1</sub></italic> is the admission date, and 1, otherwise.</p>
        <p>4. δ days observation: we construct several Boolean features based on events in the last δ days before an MRSA test time. For a patient p in a hospital encounter, let T(e) denote the set of times for a specific event e. We defined Boolean variable e<sub>δ</sub>(t)={∃t<sub>1</sub>: t<sub>1</sub>∈T(e), t<sub>1</sub>&#60;t, 0≤(t-t<sub>1</sub>)≤δ}. We considered δ=90 and e∈{Surgery, Device implant, Antibiotic, Kidney dialysis}. For the d-days ahead model, the feature is defined by considering δ+d as the parameter in the aforementioned definition, instead of δ.</p>
        <p>5. Department-based features: we constructed the following features associated with room stays:</p>
        <list list-type="bullet">
          <list-item>
            <p>ICU: this is a Boolean value that indicates whether a patient is admitted to an ICU.</p>
          </list-item>
          <list-item>
            <p>Emergency department (ED): this is a Boolean value that indicates whether a patient is admitted to the ED.</p>
          </list-item>
        </list>
        <p>As in the aforementioned features, for the d-days ahead model, the feature is defined as 1 if the admission to ICU or ED happened before t-d, where <italic>t</italic> is the MRSA test time.</p>
        <p>6. PHARMCLASS_k: there are 10 PHARMCLASS (penicillins, miscellaneous anti-infectives, cephalosporins, etc) in the data set. Each PHARMCLASS contains a list of antibiotics. For a patient, PHARMCLASS_k contains the number of antibiotic days from the MRSA testing date in the last 90 days. For the d-days ahead model, the feature is the number of antibiotic days in the 90 days before t-d.</p>
        <p>7. Test duration days: for a patient p with an MRSA testing date t, we defined this feature as t-d-t’, if there exists a time t’ (t’&#60;t) at which an MRSA test was performed for p; otherwise, we defined this feature as 0.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Patient-patient and patient-provider interactions are shown on the timeline, where each box represents a room in the hospital, patients are indicated by circles (marked with P) and health care providers are indicated by triangles (marked with H). Multiple patients could share a room, and a provider might visit multiple patients over time. A network is constructed from these interaction events over time. If 2 patients share a room for a certain period (at least for 15 min), we construct an edge between the corresponding patient nodes; similarly, if a provider visits a patient for a certain period (at least for 15 min), we construct an edge between the corresponding patient and provider nodes.</p>
          </caption>
          <graphic xlink:href="ai_v3i1e48067_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Machine Learning Classifiers</title>
        <sec>
          <title>Overview</title>
          <p>We explored the following machine learning methods: (1) logistic regression (LR; penalized) [<xref ref-type="bibr" rid="ref17">17</xref>], (2) support vector machine [<xref ref-type="bibr" rid="ref18">18</xref>], (3) random forest [<xref ref-type="bibr" rid="ref19">19</xref>], (4) gradient-boosted classifiers, and (5) XGBoost. These methods have been used extensively on EHR data, and our goal was to understand which ones do well for the MRSA risk-prediction problems we considered in this study. We have described these methods in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. We also considered these methods with products of features, that is, of the form x<sub>i</sub>(t)•x<sub>j</sub>(t) where x<sub>i</sub>(t) and x<sub>j</sub>(t) are different components of the feature vector x(t). We also discuss the Shapley Additive Explanations (SHAP) technique for understanding feature importance in each model.</p>
        </sec>
        <sec>
          <title>Model Explainability Using SHAP</title>
          <p>SHAP [<xref ref-type="bibr" rid="ref20">20</xref>] is a visual feature-attribution process that has many applications in explainable artificial intelligence. It uses a game-theoretic methodology to measure the influence of each feature on the target variable of a machine learning model. Visual representations such as the one in <xref rid="figure2" ref-type="fig">Figure 2</xref>, referred to as a summary plot, are used to show the importance of features. The interpretations of this plot are as follows:</p>
          <list list-type="bullet">
            <list-item>
              <p>The y-axis specifies the important features arranged from top to bottom regarding their importance (in descending order) to the response variable (the MRSA test result).</p>
            </list-item>
            <list-item>
              <p>The x-axis indicates the SHAP value of the corresponding feature. The SHAP value of a feature indicates the change in log odds that can be used to extract the probability of success. The color bar on the right-hand side indicates the gradient of log odds from low to high, with the color spectrum from blue to red.</p>
            </list-item>
            <list-item>
              <p>Each point in the SHAP plot for a feature represents an observation of the original data set.</p>
            </list-item>
          </list>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>(A) Performance of models on the test data set: performance of different machine learning models on the entire University of Virginia data set. The penalized logistic regression (LR) model with degree-2 features performs best (the receiver operating characteristics-area under the curve [ROC-AUC] for the LR model without feature transformation to degree-2 is 0.734). (B) The most significant features in this model were identified using Shapley Additive Explanations (SHAP). GBC: gradient boosted classifier; RF: random forest; SVC: support vector classifier.</p>
            </caption>
            <graphic xlink:href="ai_v3i1e48067_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Heterogeneous Risk-Prediction Models for Selected Subpopulations</title>
          <p>To improve performance, we developed heterogeneous subpopulation-specific models as described in the subsequent sections.</p>
          <sec>
            <title>Based on Testing History</title>
            <p>Let K<sub>p,t</sub>∈{+1,-1} denote an MRSA test result for a patient <italic>p</italic> at time <italic>t</italic> in a hospital encounter. The testing history H<sub>p,t</sub> is defined as H<sup>j</sup><sub>p,t</sub>={K<sub>p,ti</sub>:1≤i≤j, t<sub>j</sub>&#60;t<sub>j-1</sub>&#60;...&#60;t<sub>1</sub>&#60;t}. No testing history exists for a newly admitted patient, expressed as H<sub>p,t</sub>=∅. The testing history, considering only the last test result, is expressed as H<sup>1</sup><sub>p,t</sub>={K<sub>p,t1</sub>}. Similarly, the testing history, considering the last 2 test results, is expressed as H<sup>2</sup><sub>p,t</sub>={K<sub>p,t1</sub>, K<sub>p,t2</sub>}. The number of patients with longer histories drops significantly; therefore, we limited our experiments to the last 2 test results. <xref ref-type="table" rid="table1">Table 1</xref> presents the distribution of data points for the different subpopulations.</p>
            <table-wrap position="float" id="table1">
              <label>Table 1</label>
              <caption>
                <p>Total number of observations and percentages of positive observations for the subpopulations based on different testing histories.</p>
              </caption>
              <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
                <col width="160"/>
                <col width="220"/>
                <col width="200"/>
                <col width="200"/>
                <col width="220"/>
                <thead>
                  <tr valign="top">
                    <td>Previous test history</td>
                    <td>Total observations</td>
                    <td>Current test result (−1)</td>
                    <td>Current test result (+1)</td>
                    <td>Positive observations (%)</td>
                  </tr>
                </thead>
                <tbody>
                  <tr valign="top">
                    <td>None</td>
                    <td>27,612</td>
                    <td>24,371</td>
                    <td>3241</td>
                    <td>11.74</td>
                  </tr>
                  <tr valign="top">
                    <td>–1</td>
                    <td>11,338</td>
                    <td>10,179</td>
                    <td>1159</td>
                    <td>10.22</td>
                  </tr>
                  <tr valign="top">
                    <td>+1</td>
                    <td>3409</td>
                    <td>863</td>
                    <td>2546</td>
                    <td>74.68</td>
                  </tr>
                  <tr valign="top">
                    <td>(–1, –1)</td>
                    <td>4755</td>
                    <td>4320</td>
                    <td>435</td>
                    <td>9.15</td>
                  </tr>
                  <tr valign="top">
                    <td>(–1, +1)</td>
                    <td>635</td>
                    <td>198</td>
                    <td>437</td>
                    <td>68.82</td>
                  </tr>
                  <tr valign="top">
                    <td>(+1, –1)</td>
                    <td>480</td>
                    <td>328</td>
                    <td>152</td>
                    <td>31.67</td>
                  </tr>
                  <tr valign="top">
                    <td>(+1, +1)</td>
                    <td>1486</td>
                    <td>296</td>
                    <td>1190</td>
                    <td>80.08</td>
                  </tr>
                </tbody>
              </table>
            </table-wrap>
          </sec>
          <sec>
            <title>Based on the Admission Source</title>
            <p>Recall the Boolean feature named “From health care facility,” which is 1 if the admission source of a patient is a health care facility. We constructed 2 subpopulations based on whether this feature is 0 or 1; the distributions of these subpopulations and the percentage of positive observations in each are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
            <table-wrap position="float" id="table2">
              <label>Table 2</label>
              <caption>
                <p>Total number of observations and percentages of positive observations for the subpopulations based on different categories.</p>
              </caption>
              <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
                <col width="30"/>
                <col width="290"/>
                <col width="160"/>
                <col width="170"/>
                <col width="170"/>
                <col width="180"/>
                <thead>
                  <tr valign="top">
                    <td colspan="2">Subpopulations</td>
                    <td>Total observations</td>
                    <td>Test result (<italic>−</italic>1)</td>
                    <td>Test result (+1)</td>
                    <td>Positive observations (%)</td>
                  </tr>
                </thead>
                <tbody>
                  <tr valign="top">
                    <td colspan="6">
                      <bold>Admission source</bold>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>Health care facility</td>
                    <td>2241</td>
                    <td>1619</td>
                    <td>622</td>
                    <td>27.76</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>Other</td>
                    <td>42,840</td>
                    <td>36,198</td>
                    <td>6642</td>
                    <td>15.50</td>
                  </tr>
                  <tr valign="top">
                    <td colspan="6">
                      <bold>Department</bold>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>ICU<sup>a</sup></td>
                    <td>27,616</td>
                    <td>24,436</td>
                    <td>3180</td>
                    <td>11.52</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>ED<sup>b</sup></td>
                    <td>2538</td>
                    <td>1658</td>
                    <td>880</td>
                    <td>34.67</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>Other</td>
                    <td>15,201</td>
                    <td>11,918</td>
                    <td>3283</td>
                    <td>21.60</td>
                  </tr>
                  <tr valign="top">
                    <td colspan="6">
                      <bold>Hospital stays (days)</bold>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>≤15</td>
                    <td>39,221</td>
                    <td>32,541</td>
                    <td>6680</td>
                    <td>20.53</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>&#62;15</td>
                    <td>1643</td>
                    <td>1413</td>
                    <td>230</td>
                    <td>16.28</td>
                  </tr>
                  <tr valign="top">
                    <td colspan="6">
                      <bold>Antibiotic use (days)</bold>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>≤90</td>
                    <td>30,776</td>
                    <td>25,065</td>
                    <td>5711</td>
                    <td>18.56</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>&#62;90</td>
                    <td>16,646</td>
                    <td>12,997</td>
                    <td>3649</td>
                    <td>21.92</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>0</td>
                    <td>7097</td>
                    <td>6368</td>
                    <td>729</td>
                    <td>10.27</td>
                  </tr>
                  <tr valign="top">
                    <td colspan="6">
                      <bold>Age group (years)</bold>
                    </td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>0-50</td>
                    <td>14,269</td>
                    <td>12,093</td>
                    <td>2176</td>
                    <td>15.25</td>
                  </tr>
                  <tr valign="top">
                    <td>
                      <break/>
                    </td>
                    <td>≥50</td>
                    <td>27,638</td>
                    <td>23,008</td>
                    <td>4630</td>
                    <td>16.75</td>
                  </tr>
                </tbody>
              </table>
              <table-wrap-foot>
                <fn id="table2fn1">
                  <p><sup>a</sup>ICU: intensive care unit.</p>
                </fn>
                <fn id="table2fn2">
                  <p><sup>b</sup>ED: emergency department.</p>
                </fn>
              </table-wrap-foot>
            </table-wrap>
          </sec>
          <sec>
            <title>Based on Department</title>
            <p>Recall that ICU and ED are 2 department-based features, which indicate whether the patient is in the ICU or the ED, respectively. The distributions of the subpopulations and the percentage of positive observations are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          </sec>
          <sec>
            <title>Based on Hospital Stay</title>
            <p>The feature “<italic>Length of stay</italic>” captures the number of days a patient has been in the hospital till time t-d, where t is the MRSA test date and d ≥ 0 is the parameter for the d-days ahead model. On the basis of this feature, we constructed 2 subpopulations. The first is the group of patients who have stayed in the hospital for at most 15 days, and the second is the group of patients who have stayed there for &#62;15 days. The distribution of these subpopulations and the percentage of positive observations are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          </sec>
          <sec>
            <title>Based on Antibiotic Use</title>
            <p>Three subpopulations were created based on the number of days for which a patient takes an antibiotic: (1) patients who never took any antibiotics, (2) patients who took antibiotics within the last 90 days from the MRSA testing date, and (3) patients who took antibiotics for more than 90 days from the MRSA testing date. The distribution of these subpopulations and the percentage of positive observations are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          </sec>
          <sec>
            <title>Based on Age Group</title>
            <p>A total of 2 age group–specific patient subgroups, namely 0 to 50 and ≥50 years, are considered for the analysis. The distribution of these subpopulations and the percentage of positive observations are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          </sec>
          <sec>
            <title>Hierarchical Subpopulation-Based Models</title>
            <p><xref rid="figure3" ref-type="fig">Figure 3</xref> shows the schematic architecture of the hierarchical model. The construction steps of the hierarchical model are as follows:</p>
            <list list-type="bullet">
              <list-item>
                <p>S1: we defined a set of feature-based rules R at each level to create mutually exclusive subpopulations:</p>
                <list list-type="bullet">
                  <list-item>
                    <p>At level 1, the rules on the feature named “Age-group” are (1) R(α)=patient subgroup of 0 to 50 years old and (2) R(α’)=patient subgroup of more than 50 years old. Each rule creates a patient subpopulation. The patients in these two subpopulations are mutually exclusive, which can be expressed as: P(α)∩P(α’)=∅.</p>
                  </list-item>
                  <list-item>
                    <p>At level 2, each age-group-specific subpopulation is subdivided based on another feature named “Department.” The rules on the “Department” feature are (1) R(β)=patient subgroup of ICU and (2) R(γ)=patient subgroup of ED. Patients admitted to other departments are not considered in this model.</p>
                  </list-item>
                  <list-item>
                    <p>The two-level hierarchical structure creates a set of composite rules (combining rules of each level) at the leaf level that we call two-level rules. The rules are as follows: (a) R(α∩β), (b) R(α∩γ), (c) R(α’∩β), and (d) R(α’∩γ).</p>
                  </list-item>
                </list>
              </list-item>
              <list-item>
                <p>S2: the training population is split based on the 2-level rules. Each training subpopulation is trained on several machine learning models, and the best-performing model is used for prediction.</p>
              </list-item>
              <list-item>
                <p>S3: each test observation is passed to the corresponding model using the 2-level rule. The observation with prediction is stored in a buffer. After completing all the testing observations, the buffer is treated as the model’s output.</p>
              </list-item>
            </list>
            <fig id="figure3" position="float">
              <label>Figure 3</label>
              <caption>
                <p>A schematic view of the hierarchical model architecture. In the figure, Xi represents the i-th observation, y is the model prediction, α is the patient subpopulation who are 0 to 50 years old, α' is the patient subpopulation who are more than 50 years old, β is the patient subpopulation who were admitted to the intensive care unit (ICU), γ is the subpopulation who were admitted to the emergency department (ED), and R is a feature-based rule to aggregate data. For instance, R(α∩β) is a 0 to 50 age group patient subpopulation admitted to ICU. At level 1, the overall population is subdivided into two subpopulations based on the feature named “Age-group.” The patient subpopulation of age group (0 to 50 years) is mutually exclusive to the patient subpopulation of age group (&#62;50 years). Each age group–specific subpopulation is further subdivided into the next level (level 2) based on another feature named “Department.” The patient subpopulation of the ICU department is mutually exclusive to the ED subpopulation. The training data are split based on the 2-level rules, and each patient subpopulation is trained using the best-fitted model. During the testing phase, each data point passes to the appropriate model using the same 2-level rules, and the best-fitted model predicts the outcome. The outcomes of all the models are merged back into the resultant prediction of this hierarchical model.</p>
              </caption>
              <graphic xlink:href="ai_v3i1e48067_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
        </sec>
        <sec>
          <title>Data Set for d-Days Ahead Prediction</title>
          <p>We prepared data sets to observe how prediction performance changes with <italic>d</italic>, which is discussed in the Methods section. For each <italic>d</italic>∈{1,2,…,7}, we created a data set, where the feature vector for a patient is generated based on the history of that patient till date <italic>t-d</italic>, where <italic>t</italic> is the MRSA testing date for that patient.</p>
        </sec>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The data used in the paper was obtained through institutional review board approval and is fully anonymized. Therefore, there are no ethical considerations.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Prediction Model for the Entire Population</title>
        <p>We applied multiple machine learning models, including penalized LR, gradient-boosted classifier, Random Forest, support vector classifier, and XGBoost classifier (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>), to the UVA Hospital MRSA patient data sets. We used an 80% to 20% split to construct the train and test data sets. <xref rid="figure2" ref-type="fig">Figure 2</xref>A shows the performance of the models. A model’s best set of hyperparameters was computed from the training data set using grid search and 10-fold cross-validation. Penalized LR was the best-performing model with the corresponding performance metrics: (1) the FNR score is 0.074, and (2) the ROC-AUC score is 0.826. <xref ref-type="table" rid="table3">Table 3</xref> presents other performance metrics for this data set.</p>
        <p>Given the same hyperparameter settings for the penalized LR model, the model performance (ROC-AUC) dropped to 0.734 when we did not consider the product features; therefore, this feature transformation provides a significant benefit. Using the SHAP technique discussed in the Methods section, we extracted the following key features from <xref rid="figure2" ref-type="fig">Figure 2</xref>B:</p>
        <list list-type="order">
          <list-item>
            <p>“AdmissionType_Urgent,” “ICU admitted,” “Provider 7,” and “Provider 14” are the top 4 features. Recall that “AdmissionType_Urgent” is a Boolean variable where the value 1 indicates that the patient was admitted as “Urgent.” Patients admitted as urgent have a higher predicted likelihood of MRSA infection. Similarly, “ICU admitted” is a Boolean feature where the value 1 indicates that the corresponding patient is admitted to the ICU department; such patients are more likely to be predicted to have MRSA infection. On the other hand, “Provider 7” and “Provider 14” indicate the total number of providers a patient contacted in the last 7 and 14 days from the testing date. The higher value of these features is associated with high and negative values for the target feature (MRSA test). A high value comes from the rightmost color bar, and a negative value comes from the x-axis.</p>
          </list-item>
          <list-item>
            <p>A high value of “MRSA 7” (which indicates the total number of patients with an MRSA-positive result a patient contacted in the last 7 days from the testing date) is associated with a high and positive value of the target feature (the MRSA test); this holds similarly for the “MRSA 14” feature.</p>
          </list-item>
          <list-item>
            <p>In addition to single features, composite features also correlate more with MRSA infection prediction. For instance, “AdmissionType Emergency” and “MRSA 7” together (similar to “AdmissionType Emergency” and “MRSA 14”) are associated with high and positive values of the target feature (the MRSA test).</p>
          </list-item>
          <list-item>
            <p>“PHARMCLASS_4” appears to be an important feature compared to the other PHARMCLASS features. In most cases, this variable is associated with high and positive values for the target feature.</p>
          </list-item>
        </list>
        <p>The computational complexity of SHAP increases with the size of the test data set. The best-fitted model is passed to the SHAP explainer method, and it took 5 hours to generate the summary plot (<xref rid="figure2" ref-type="fig">Figure 2</xref>B) when the test data set contains 8174 observations and 4656 features. For the same best-fitted model, the SHAP explainer required 1 hour to generate the summary plot when the test data set contained the same number of observations, but the number of features was reduced to 97. Finally, the time was the same when the number of observations in the test data set was reduced to 817, and the number of features was 4656.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Performance metrics of the best-performing model for each patient subpopulation based on room allocation, admission source, hospital stay, and antibiotic medication period.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="120"/>
            <col width="70"/>
            <col width="120"/>
            <col width="90"/>
            <col width="100"/>
            <col width="100"/>
            <col width="90"/>
            <col width="70"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td>Subpopulation</td>
                <td>Model<sup>a</sup></td>
                <td>ROC-AUC<sup>b</sup></td>
                <td>AUPRC<sup>c</sup></td>
                <td>Sensitivity</td>
                <td>Specificity</td>
                <td>Precision</td>
                <td>FPR<sup>d</sup> or fallout</td>
                <td>FNR<sup>e</sup></td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>MCC<sup>f</sup> score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Overall</td>
                <td>LR<sup>g</sup></td>
                <td>0.826</td>
                <td>0.504</td>
                <td>0.684</td>
                <td>0.797</td>
                <td>0.406</td>
                <td>0.203</td>
                <td>0.074</td>
                <td>0.510</td>
                <td>0.400</td>
              </tr>
              <tr valign="top">
                <td>ICU<sup>h</sup></td>
                <td>LR</td>
                <td>0.876</td>
                <td>0.428</td>
                <td>0.775</td>
                <td>0.826</td>
                <td>0.381</td>
                <td>0.174</td>
                <td>0.036</td>
                <td>0.511</td>
                <td>0.455</td>
              </tr>
              <tr valign="top">
                <td>ED<sup>i</sup></td>
                <td>LR</td>
                <td>
                  <italic>0.936</italic>
                  <sup>j</sup>
                </td>
                <td>
                  <italic>0.882</italic>
                </td>
                <td>
                  <italic>0.878</italic>
                </td>
                <td>
                  <italic>0.886</italic>
                </td>
                <td>
                  <italic>0.800</italic>
                </td>
                <td>
                  <italic>0.114</italic>
                </td>
                <td>0.067</td>
                <td>
                  <italic>0.837</italic>
                </td>
                <td>
                  <italic>0.749</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>Other rooms</td>
                <td>LR</td>
                <td>0.752</td>
                <td>0.451</td>
                <td>0.574</td>
                <td>0.793</td>
                <td>0.389</td>
                <td>0.207</td>
                <td>0.110</td>
                <td>0.463</td>
                <td>0.320</td>
              </tr>
              <tr valign="top">
                <td>From HCF<sup>k</sup></td>
                <td>LR</td>
                <td>0.804</td>
                <td>0.585</td>
                <td>0.536</td>
                <td>0.861</td>
                <td>0.571</td>
                <td>0.139</td>
                <td>0.157</td>
                <td>0.553</td>
                <td>0.405</td>
              </tr>
              <tr valign="top">
                <td>Not from HCF</td>
                <td>LR</td>
                <td>0.831</td>
                <td>0.492</td>
                <td>0.699</td>
                <td>0.801</td>
                <td>0.413</td>
                <td>0.199</td>
                <td>0.070</td>
                <td>0.519</td>
                <td>0.414</td>
              </tr>
              <tr valign="top">
                <td>Hospital stay ≤15 days</td>
                <td>LR</td>
                <td>0.837</td>
                <td>0.518</td>
                <td>0.722</td>
                <td>0.789</td>
                <td>0.415</td>
                <td>0.211</td>
                <td>0.068</td>
                <td>0.527</td>
                <td>0.421</td>
              </tr>
              <tr valign="top">
                <td>Hospital stay <italic>&#62;</italic>15 days</td>
                <td>LR</td>
                <td>0.729</td>
                <td>0.494</td>
                <td>0.596</td>
                <td>0.803</td>
                <td>0.360</td>
                <td>0.197</td>
                <td>0.086</td>
                <td>0.449</td>
                <td>0.331</td>
              </tr>
              <tr valign="top">
                <td>Antibiotic ≤90 days</td>
                <td>LR</td>
                <td>0.826</td>
                <td>0.525</td>
                <td>0.681</td>
                <td>0.807</td>
                <td>0.434</td>
                <td>0.193</td>
                <td>0.079</td>
                <td>0.530</td>
                <td>0.416</td>
              </tr>
              <tr valign="top">
                <td>Antibiotic &#62;90 days</td>
                <td>LR</td>
                <td>0.841</td>
                <td>0.566</td>
                <td>0.697</td>
                <td>0.809</td>
                <td>0.496</td>
                <td>0.191</td>
                <td>0.092</td>
                <td>0.580</td>
                <td>0.453</td>
              </tr>
              <tr valign="top">
                <td>No antibiotic use</td>
                <td>LR</td>
                <td>0.834</td>
                <td>0.328</td>
                <td>0.734</td>
                <td>0.721</td>
                <td>0.201</td>
                <td>0.279</td>
                <td>
                  <italic>0.034</italic>
                </td>
                <td>0.315</td>
                <td>0.275</td>
              </tr>
              <tr valign="top">
                <td>Age group (0-50 years)</td>
                <td>LR</td>
                <td>0.782</td>
                <td>0.482</td>
                <td>0.613</td>
                <td>0.777</td>
                <td>0.364</td>
                <td>0.223</td>
                <td>0.094</td>
                <td>0.457</td>
                <td>0.325</td>
              </tr>
              <tr valign="top">
                <td>Age group (≥50 years)</td>
                <td>LR</td>
                <td>0.833</td>
                <td>0.514</td>
                <td>0.660</td>
                <td>0.817</td>
                <td>0.428</td>
                <td>0.183</td>
                <td>0.079</td>
                <td>0.520</td>
                <td>0.408</td>
              </tr>
              <tr valign="top">
                <td>Hierarchical model<sup>l</sup></td>
                <td>HM</td>
                <td>
                  <italic>0.883</italic>
                </td>
                <td>0.490</td>
                <td>
                  <italic>0.807</italic>
                </td>
                <td>0.832</td>
                <td>0.440</td>
                <td>0.168</td>
                <td>
                  <italic>0.037</italic>
                </td>
                <td>0.569</td>
                <td>0.507</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>This column specifies the best-performing model.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>ROC-AUC: receiver operating characteristics-area under the curve.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>AUPRC: area under the precision-recall curve.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>FPR: false positive rate.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>FNR: false negative rate.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>MCC: Matthews correlation coefficient.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>LR: penalized logistic regression.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>ICU: intensive care unit.</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>ED: emergency department.</p>
            </fn>
            <fn id="table3fn10">
              <p><sup>j</sup>The best value for each performance metric is italicized.</p>
            </fn>
            <fn id="table3fn11">
              <p><sup>k</sup>HCF: health care facility.</p>
            </fn>
            <fn id="table3fn12">
              <p><sup>l</sup>For “Hierarchical model” (last row), the highlighted metric (in italics) indicates comparatively better performance than most of the other subpopulations.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Effect of the Imbalanced Data Set</title>
        <p>We evaluated the performance achieved using the different sampling techniques discussed earlier. First, as in the study by Hartvigsen et al [<xref ref-type="bibr" rid="ref8">8</xref>], we used a random selection-based down-sampling technique to select majority-class observations and balance the number of observations between the majority and minority classes. The balanced data are split into train and test data. The ROC-AUC score of the best-performing model on the test data is 0.731. We used the synthetic minority oversampling technique (SMOTE) [<xref ref-type="bibr" rid="ref21">21</xref>] on our data set to balance both majority and minority classes. The ROC-AUC score of the best-performing model on the test data is 0.896. Similar to the study by Hirano et al [<xref ref-type="bibr" rid="ref9">9</xref>], we used SMOTE to balance the majority and minority classes in the imbalanced train and test data. The ROC-AUC score of the best-performing model on the test data is 0.903. However, when we evaluated the performance of the abovementioned models on a random test data set, the ROC-AUC score was significantly lower at 0.701. Thus, for our problem, the biased sampling techniques did not improve performance.</p>
      </sec>
      <sec>
        <title>Subpopulation-Specific Results</title>
        <p>For the entire data set, our models and feature engineering could not improve the ROC-AUC beyond 0.826. We now discuss the results of subpopulation-specific models.</p>
        <sec>
          <title>Testing History–Based Analysis</title>
          <p>The best-fitted model on testing history–based subpopulations (<xref ref-type="table" rid="table4">Table 4</xref>) showed the best performance on three subpopulations: (1) patients with a (−1) testing history: the best-fitted model had an ROC-AUC of 0.802; (2) patients with a (−1, −1) testing history: the best-fitted model had an ROC-AUC of 0.848 and an FNR of 0.035; and (3) patients with a (+1, +1) testing history: the best model, in terms of the area under the precision-recall curve (AUPRC; Qi et al [<xref ref-type="bibr" rid="ref22">22</xref>] suggested this metric for imbalanced data) performance metric, had an AUPRC of 0.910 (<xref rid="figure4" ref-type="fig">Figure 4</xref>B). The results for the other testing history–based data sets are shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
          <p><xref rid="figure4" ref-type="fig">Figure 4</xref>C shows the significant features (using the SHAP technique) for the (−1, −1) testing history–based subpopulations. The topmost feature (“MRSA 14”) is a network-based feature. Moreover, the network-based features are among the top 10 features. Among these features, “MRSA 7” and “MRSA 14” are positively associated with MRSA infection. In addition to the network features, the interval between the 2 MRSA tests is also important. In addition, patient comorbidity conditions have a significant correlation with MRSA infection.</p>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Performance metrics for the best-performing model for each patient subpopulation based on testing history.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="120"/>
              <col width="100"/>
              <col width="80"/>
              <col width="110"/>
              <col width="100"/>
              <col width="100"/>
              <col width="80"/>
              <col width="90"/>
              <col width="70"/>
              <col width="80"/>
              <col width="70"/>
              <thead>
                <tr valign="top">
                  <td>Testing history</td>
                  <td>Model<sup>a</sup></td>
                  <td>ROC-AUC<sup>b</sup></td>
                  <td>AUPRC<sup>c</sup></td>
                  <td>Sensitivity</td>
                  <td>Specificity</td>
                  <td>Precision</td>
                  <td>FPR<sup>d</sup> or fall out</td>
                  <td>FNR<sup>e</sup></td>
                  <td><italic>F</italic><sub>1</sub>-score</td>
                  <td>MCC<sup>f</sup> score</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>None</td>
                  <td>LR<sup>g</sup></td>
                  <td>0.814</td>
                  <td>0.406</td>
                  <td>0.689</td>
                  <td>0.749</td>
                  <td>0.276</td>
                  <td>0.251</td>
                  <td>0.054</td>
                  <td>0.394</td>
                  <td>0.311</td>
                </tr>
                <tr valign="top">
                  <td>(<italic>−</italic>1)</td>
                  <td>GB<sup>h</sup></td>
                  <td>0.802</td>
                  <td>0.331</td>
                  <td>0.281</td>
                  <td>
                    <italic>0.953</italic>
                    <sup>i</sup>
                  </td>
                  <td>0.400</td>
                  <td>
                    <italic>0.047</italic>
                  </td>
                  <td>0.078</td>
                  <td>0.330</td>
                  <td>0.274</td>
                </tr>
                <tr valign="top">
                  <td>(+1)</td>
                  <td>LR</td>
                  <td>0.718</td>
                  <td>0.884</td>
                  <td>0.649</td>
                  <td>0.651</td>
                  <td>0.847</td>
                  <td>0.349</td>
                  <td>0.615</td>
                  <td>
                    <italic>0.735</italic>
                  </td>
                  <td>0.264</td>
                </tr>
                <tr valign="top">
                  <td>(−1,−1)</td>
                  <td>LR</td>
                  <td>
                    <italic>0.848</italic>
                  </td>
                  <td>0.402</td>
                  <td>0.697</td>
                  <td>0.855</td>
                  <td>0.332</td>
                  <td>0.145</td>
                  <td>
                    <italic>0.035</italic>
                  </td>
                  <td>0.449</td>
                  <td>
                    <italic>0.404</italic>
                  </td>
                </tr>
                <tr valign="top">
                  <td>(<italic>−</italic>1<italic>,</italic> +1)</td>
                  <td>SV<sup>j</sup></td>
                  <td>0.613</td>
                  <td>0.781</td>
                  <td>0.295</td>
                  <td>0.897</td>
                  <td>0.867</td>
                  <td>0.103</td>
                  <td>0.639</td>
                  <td>0.441</td>
                  <td>0.209</td>
                </tr>
                <tr valign="top">
                  <td>(+1<italic>, −</italic>1)</td>
                  <td>SV</td>
                  <td>0.558</td>
                  <td>0.614</td>
                  <td>
                    <italic>0.875</italic>
                  </td>
                  <td>0.031</td>
                  <td>0.311</td>
                  <td>0.969</td>
                  <td>0.667</td>
                  <td>0.459</td>
                  <td>0.183</td>
                </tr>
                <tr valign="top">
                  <td>(+1<italic>,</italic> +1)</td>
                  <td>LR</td>
                  <td>0.761</td>
                  <td>
                    <italic>0.910</italic>
                  </td>
                  <td>0.595</td>
                  <td>0.787</td>
                  <td>
                    <italic>0.916</italic>
                  </td>
                  <td>0.213</td>
                  <td>0.667</td>
                  <td>0.721</td>
                  <td>0.308</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>The “Model” column specifies the best-performing model (LR=penalized logistic regression classifier, GB=gradient boosting, and SV=support vector).</p>
              </fn>
              <fn id="table4fn2">
                <p><sup>b</sup>ROC-AUC: receiver operating characteristics-area under the curve.</p>
              </fn>
              <fn id="table4fn3">
                <p><sup>c</sup>AUPRC: area under the precision-recall curve.</p>
              </fn>
              <fn id="table4fn4">
                <p><sup>d</sup>FPR: false positive rate.</p>
              </fn>
              <fn id="table4fn5">
                <p><sup>e</sup>FNR: false negative rate.</p>
              </fn>
              <fn id="table4fn6">
                <p><sup>f</sup>MCC: Matthews correlation coefficient.</p>
              </fn>
              <fn id="table4fn7">
                <p><sup>g</sup>LR: penalized logistic regression.</p>
              </fn>
              <fn id="table4fn8">
                <p><sup>h</sup>GB: gradient boosting.</p>
              </fn>
              <fn id="table4fn9">
                <p><sup>i</sup>The best value for each performance metric is italicized.</p>
              </fn>
              <fn id="table4fn10">
                <p><sup>j</sup>SV: support vector.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Results for best-performing subpopulations based on testing history: (A) Performance (receiver operating characteristics-area under the curve [ROC-AUC]) of different machine learning models for testing history (−1, −1), that is, the last 2 testing results are negative—penalized logistic regression (LR) has the best performance. (B) Performance (area under the precision-recall curve [AUPRC]) of different machine learning models for testing history (+1, +1), that is, the last 2 testing results are positive—penalized LR has the best performance. (C) Top features for (−1, −1) testing history–based subpopulation using the LR model. GBC: gradient boosted classifier; RF: random forest; SVC: support vector classifier.</p>
            </caption>
            <graphic xlink:href="ai_v3i1e48067_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Analysis for ICU and ED Subpopulations</title>
          <p>We developed models for other subpopulations, and the performance of the best-fitted models for these subpopulations is reported in <xref ref-type="table" rid="table3">Table 3</xref>. We found that the best performance is for the ED subpopulation in terms of both ROC-AUC and AUPRC. The ROC-AUC value for the best-fitted model is 0.936 (<xref rid="figure5" ref-type="fig">Figure 5</xref>A), and the AUPRC value for the best-fitted model is 0.882 (<xref rid="figure5" ref-type="fig">Figure 5</xref>B). Regarding the FNR, the model performs best for the subpopulation with no antibiotic use. The FNR score obtained using the best-performing model for this data set is 0.034. The subpopulation with the second-best performance in terms of FNR is the ICU subpopulation (<xref rid="figure6" ref-type="fig">Figure 6</xref>), and the corresponding FNR score is 0.036. The results for the other subpopulations are presented in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p>
          <p><xref rid="figure6" ref-type="fig">Figure 6</xref>B shows the significant features (using the SHAP technique) of the best model for the ICU subpopulation. The top 5 network-based features and the frequency of network features in the top 20 again demonstrate the significance of the network structure. Some of the nonnetwork features that appear to be important are the patient’s age, use of antibiotics in the last 90 days, use of a device in the last 90 days, test duration days, PHARMCLASS 4, and emergency and urgent-type patient admission.</p>
          <p><xref rid="figure5" ref-type="fig">Figure 5</xref>C shows the significant features (using the SHAP technique) for the best-performing model for the ED subpopulation. The top 7 features are network features. The top influential feature for the ICU subpopulation is “MRSA 14,” whereas the top significant feature for the ED subpopulation is “MRSA 7.” Unlike in the ICU, the patient’s gender, length of stay, and comorbidity conditions are also crucial in addition to network features.</p>
          <fig id="figure5" position="float">
            <label>Figure 5</label>
            <caption>
              <p>Results for the emergency department (ED) subpopulation that shows the best performance: (A) performance (receiver operating characteristics-area under the curve [ROC-AUC]) of different machine learning models—penalized logistic regression (LR) has the best performance. (B) Performance (area under the precision-recall curve [AUPRC]) of different machine learning models—penalized LR has the best performance. (C) Top features of the LR model. GBC: gradient boosted classifier; RF: random forest; SHAP: Shapley Additive Explanations; SVC: support vector classifier.</p>
            </caption>
            <graphic xlink:href="ai_v3i1e48067_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure6" position="float">
            <label>Figure 6</label>
            <caption>
              <p>(A) Performance of different machine learning models for the intensive care unit subpopulation; the penalized logistic regression (LR) model performs best. (B) Top features of the LR model. GBC: gradient boosted classifier; RF: random forest; SHAP: Shapley Additive Explanations; SVC: support vector classifier.</p>
            </caption>
            <graphic xlink:href="ai_v3i1e48067_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Hierarchical Models</title>
          <p>The performance of the hierarchical model is presented in <xref ref-type="table" rid="table3">Table 3</xref>. This model’s ROC-AUC and FNR scores are 0.883 and 0.037, respectively. This model performs better than most subpopulation-based models except for the ED subpopulation-based models.</p>
        </sec>
      </sec>
      <sec>
        <title>Importance of Network Features</title>
        <p>On the entire data set, the best-fitted model achieves its best performance (<xref ref-type="table" rid="table3">Table 3</xref>) in terms of ROC-AUC and FNR when network features are used. The corresponding ROC-AUC score is 0.826, and the FNR score is 0.074. Without the network features, the ROC-AUC score for the best-fitted model is 0.714, and the FNR score is 0.107 (<xref ref-type="table" rid="table5">Table 5</xref>).</p>
        <p>The ROC-AUC score improved by approximately 16%, and the FNR score improved by approximately 31% because of the network features. The influence of network features is also significant in the models for the ICU and ED patient subpopulations. The performance metric ROC-AUC improved by approximately 27% for the ICU department patient subpopulation, and the FNR score improved by approximately 58%. For ED patient subpopulations, the performance metric ROC-AUC improved by approximately 30%, the FNR score improved by approximately 69%, and the AUPRC score improved by approximately 50%.</p>
        <p>Network features also improve the performance of the best-fitted model for testing history–based subpopulations (<xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table6">6</xref>).</p>
        <p>The ROC-AUC of the best-fitted model for the (−1) testing history–based subpopulation improved by approximately 11%. For the (−1, −1) testing history–based subpopulation, the best-fitted model improved by approximately 25% in ROC-AUC and approximately 35% in FNR.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Performance metrics of the best-performing model for each patient subpopulation based on room allocation, admission source, hospital stay, and antibiotic medication period after excluding the network features.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="130"/>
            <col width="90"/>
            <col width="80"/>
            <col width="90"/>
            <col width="110"/>
            <col width="110"/>
            <col width="100"/>
            <col width="70"/>
            <col width="70"/>
            <col width="80"/>
            <col width="70"/>
            <thead>
              <tr valign="top">
                <td>Subpopulation</td>
                <td>Model<sup>a</sup></td>
                <td>AUC<sup>b</sup></td>
                <td>AUPRC<sup>c</sup></td>
                <td>Sensitivity</td>
                <td>Specificity</td>
                <td>Precision</td>
                <td>Fall out</td>
                <td>FNR<sup>d</sup></td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>MCC<sup>e</sup> score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Overall</td>
                <td>LR<sup>f</sup></td>
                <td>0.714</td>
                <td>0.383</td>
                <td>0.610</td>
                <td>0.709</td>
                <td>0.314</td>
                <td>0.291</td>
                <td>0.107</td>
                <td>0.415</td>
                <td>0.257</td>
              </tr>
              <tr valign="top">
                <td>ICU<sup>g</sup></td>
                <td>LR</td>
                <td>0.690</td>
                <td>0.311</td>
                <td>0.547</td>
                <td>0.760</td>
                <td>0.262</td>
                <td>0.240</td>
                <td>0.085</td>
                <td>0.354</td>
                <td>0.233</td>
              </tr>
              <tr valign="top">
                <td>ED<sup>h</sup></td>
                <td>LR</td>
                <td>0.722</td>
                <td>0.589</td>
                <td>0.593</td>
                <td>0.705</td>
                <td>0.496</td>
                <td>0.295</td>
                <td>0.220</td>
                <td>0.541</td>
                <td>0.287</td>
              </tr>
              <tr valign="top">
                <td>Other rooms</td>
                <td>LR</td>
                <td>0.692</td>
                <td>0.346</td>
                <td>0.631</td>
                <td>0.672</td>
                <td>0.308</td>
                <td>0.328</td>
                <td>0.113</td>
                <td>0.414</td>
                <td>0.243</td>
              </tr>
              <tr valign="top">
                <td>From HCF<sup>i</sup></td>
                <td>LR</td>
                <td>0.594</td>
                <td>0.340</td>
                <td>0.348</td>
                <td>0.799</td>
                <td>0.375</td>
                <td>0.201</td>
                <td>0.220</td>
                <td>0.361</td>
                <td>0.151</td>
              </tr>
              <tr valign="top">
                <td>Not from HCF</td>
                <td>LR</td>
                <td>0.721</td>
                <td>0.367</td>
                <td>0.631</td>
                <td>0.704</td>
                <td>0.298</td>
                <td>0.296</td>
                <td>0.095</td>
                <td>0.405</td>
                <td>0.261</td>
              </tr>
              <tr valign="top">
                <td>Hospital stay ≤15 days</td>
                <td>LR</td>
                <td>0.718</td>
                <td>0.381</td>
                <td>0.615</td>
                <td>0.712</td>
                <td>0.311</td>
                <td>0.288</td>
                <td>0.103</td>
                <td>0.413</td>
                <td>0.261</td>
              </tr>
              <tr valign="top">
                <td>Hospital stay &#62;15 days</td>
                <td>LR</td>
                <td>0.595</td>
                <td>0.262</td>
                <td>0.615</td>
                <td>0.566</td>
                <td>0.209</td>
                <td>0.434</td>
                <td>0.112</td>
                <td>0.312</td>
                <td>0.133</td>
              </tr>
              <tr valign="top">
                <td>Antibiotic ≤90 days</td>
                <td>LR</td>
                <td>0.732</td>
                <td>0.402</td>
                <td>0.634</td>
                <td>0.721</td>
                <td>0.336</td>
                <td>0.279</td>
                <td>0.101</td>
                <td>0.439</td>
                <td>0.288</td>
              </tr>
              <tr valign="top">
                <td>Antibiotic &#62;90 days</td>
                <td>LR</td>
                <td>0.707</td>
                <td>0.434</td>
                <td>0.621</td>
                <td>0.683</td>
                <td>0.361</td>
                <td>0.317</td>
                <td>0.138</td>
                <td>0.457</td>
                <td>0.261</td>
              </tr>
              <tr valign="top">
                <td>No antibiotic use</td>
                <td>LR</td>
                <td>0.661</td>
                <td>0.236</td>
                <td>0.520</td>
                <td>0.696</td>
                <td>0.178</td>
                <td>0.304</td>
                <td>
                  <italic>0.080</italic>
                  <sup>j</sup>
                </td>
                <td>0.265</td>
                <td>0.145</td>
              </tr>
              <tr valign="top">
                <td>Age group (0-50 years)</td>
                <td>LR</td>
                <td>0.715</td>
                <td>0.404</td>
                <td>0.617</td>
                <td>0.703</td>
                <td>0.298</td>
                <td>0.297</td>
                <td>0.100</td>
                <td>0.402</td>
                <td>0.251</td>
              </tr>
              <tr valign="top">
                <td>Age group (≥50 years)</td>
                <td>LR</td>
                <td>0.721</td>
                <td>0.357</td>
                <td>0.628</td>
                <td>0.714</td>
                <td>0.295</td>
                <td>0.286</td>
                <td>0.090</td>
                <td>0.401</td>
                <td>0.265</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>The “Model” column specifies the best-performing model (LR=penalized logistic regression classifier).</p>
            </fn>
            <fn id="table5fn2">
              <p><sup>b</sup>AUC: area under the curve.</p>
            </fn>
            <fn id="table5fn3">
              <p><sup>c</sup>AUPRC: area under the precision-recall curve.</p>
            </fn>
            <fn id="table5fn4">
              <p><sup>d</sup>FNR: false negative rate.</p>
            </fn>
            <fn id="table5fn5">
              <p><sup>e</sup>MCC: Matthews correlation coefficient.</p>
            </fn>
            <fn id="table5fn6">
              <p><sup>f</sup>LR: penalized logistic regression.</p>
            </fn>
            <fn id="table5fn7">
              <p><sup>g</sup>ICU: intensive care unit.</p>
            </fn>
            <fn id="table5fn8">
              <p><sup>h</sup>ED: emergency department.</p>
            </fn>
            <fn id="table5fn9">
              <p><sup>i</sup>HCF: health care facility.</p>
            </fn>
            <fn id="table5fn10">
              <p><sup>j</sup>Italics indicate the best (lowest) FNR value among the subpopulations.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Performance metrics for the best-performing model for each patient subpopulation based on testing history after excluding the network features.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="120"/>
            <col width="90"/>
            <col width="90"/>
            <col width="90"/>
            <col width="100"/>
            <col width="90"/>
            <col width="100"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <thead>
              <tr valign="top">
                <td>Testing history</td>
                <td>Model<sup>a</sup></td>
                <td>AUC<sup>b</sup></td>
                <td>AUPRC<sup>c</sup></td>
                <td>Sensitivity</td>
                <td>Specificity</td>
                <td>Precision</td>
                <td>Fall out</td>
                <td>FNR<sup>d</sup></td>
                <td><italic>F</italic><sub>1</sub>-score</td>
                <td>MCC<sup>e</sup> score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>None</td>
                <td>LR<sup>f</sup></td>
                <td>0.660</td>
                <td>0.221</td>
                <td>0.565</td>
                <td>0.660</td>
                <td>0.187</td>
                <td>0.340</td>
                <td>0.084</td>
                <td>0.281</td>
                <td>0.153</td>
              </tr>
              <tr valign="top">
                <td>(−1)</td>
                <td>GB<sup>g</sup></td>
                <td>0.723</td>
                <td>0.233</td>
                <td>0.031</td>
                <td>0.996</td>
                <td>0.467</td>
                <td>0.004</td>
                <td>0.098</td>
                <td>0.058</td>
                <td>0.099</td>
              </tr>
              <tr valign="top">
                <td>(+1)</td>
                <td>LR</td>
                <td>0.685</td>
                <td>0.851</td>
                <td>0.623</td>
                <td>0.628</td>
                <td>0.821</td>
                <td>0.372</td>
                <td>0.620</td>
                <td>0.708</td>
                <td>0.224</td>
              </tr>
              <tr valign="top">
                <td>(−1, −1)</td>
                <td>LR</td>
                <td>0.677</td>
                <td>0.196</td>
                <td>0.663</td>
                <td>0.615</td>
                <td>0.151</td>
                <td>0.385</td>
                <td>0.054</td>
                <td>0.246</td>
                <td>0.164</td>
              </tr>
              <tr valign="top">
                <td>(−1, +1)</td>
                <td>SV<sup>h</sup></td>
                <td>0.637</td>
                <td>0.797</td>
                <td>0.625</td>
                <td>0.615</td>
                <td>0.786</td>
                <td>0.385</td>
                <td>0.579</td>
                <td>0.696</td>
                <td>0.223</td>
              </tr>
              <tr valign="top">
                <td>(+1, −1)</td>
                <td>SV</td>
                <td>0.507</td>
                <td>0.356</td>
                <td>0.375</td>
                <td>0.656</td>
                <td>0.353</td>
                <td>0.344</td>
                <td>0.323</td>
                <td>0.364</td>
                <td>0.031</td>
              </tr>
              <tr valign="top">
                <td>(+1, +1)</td>
                <td>LR</td>
                <td>0.691</td>
                <td>0.881</td>
                <td>0.605</td>
                <td>0.719</td>
                <td>0.887</td>
                <td>0.281</td>
                <td>0.667</td>
                <td>0.719</td>
                <td>0.267</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>The “Model” column specifies the best-performing model (LR=penalized logistic regression, GB=gradient boosting, and SV=support vector).</p>
            </fn>
            <fn id="table6fn2">
              <p><sup>b</sup>AUC: area under the curve.</p>
            </fn>
            <fn id="table6fn3">
              <p><sup>c</sup>AUPRC: area under the precision-recall curve.</p>
            </fn>
            <fn id="table6fn4">
              <p><sup>d</sup>FNR: false negative rate.</p>
            </fn>
            <fn id="table6fn5">
              <p><sup>e</sup>MCC: Matthews correlation coefficient.</p>
            </fn>
            <fn id="table6fn6">
              <p><sup>f</sup>LR: penalized logistic regression.</p>
            </fn>
            <fn id="table6fn7">
              <p><sup>g</sup>GB: gradient boosting.</p>
            </fn>
            <fn id="table6fn8">
              <p><sup>h</sup>SV: support vector.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>d-Days Ahead Model Prediction</title>
        <p>We now examine how well the test results can be predicted per the <italic>d</italic>-days ahead model. We expected the performance to drop as <italic>d</italic> increases, as shown in <xref rid="figure7" ref-type="fig">Figure 7</xref>, which shows the ROC-AUC score of the best-fitted model (for the data set corresponding to <italic>d</italic>-days before the test, as described in the Methods section) versus <italic>d</italic>. Note that the performance decays significantly with <italic>d</italic>.</p>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p>d-days ahead prediction: performance (receiver operating characteristics-area under the curve [ROC-AUC]) of best model versus d. The performance drops gradually with d.</p>
          </caption>
          <graphic xlink:href="ai_v3i1e48067_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our results demonstrate that clinically relevant models can be developed for predicting MRSA test results with high accuracy using a combination of clinical and nonclinical features from EHR data. In particular, features of contact networks (eg, “MRSA 7,” “MRSA 14,” “Provider 7,” and “Provider 14”) constructed from EHR data are quite significant in our models. <xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref> show the performance of the models on the same group of data sets without considering the network features. The empirical results establish that the network features have a significant impact (the ROC-AUC of the models improves by &gt;15%) on MRSA infection prediction.</p>
        <p>We took the simplest approach to network construction, which views edges as unweighted, and did not consider heterogeneity in contacts, for example, based on types of providers. It is interesting that even the simplest approach improves performance. While more characteristics of networks and edge weights could be considered and these might improve the performance, the value of our simple approach is that it is easier to construct and is likely more generalizable and robust because there might be uncertainties in some of these additional characteristics.</p>
        <p>In addition to network features, we observed that features associated with antibiotic use (“Antibiotic days”, “Antibiotic days in last 90 days”, “Antibiotic days in last 90+ days”, “PHARMCLASS_1” to “PHARMCLASS_10”, etc.), different kinds of events in the past 90 days (eg, kidney dialysis, device use, and any surgery), and comorbidity conditions such as diabetes without complications (diab or diabunc), hypothyroidism (hypothy), uncomplicated hypertension (hypunc), the Charlson score, the Elixhauser score, the weighted version of the Elixhauser score using the van Walraven algorithm (wscore vw), the weighted version of the Elixhauser score using the <italic>Agency for Healthcare Research and Quality</italic> (AHRQ) algorithm (wscore ahrq), and the weighted version of the Charlson score (wscore) are also predictive; many of these have been identified as important in prior work.</p>
        <p>The penalized LR model with degree-2 polynomial features performs best in almost all settings, using a new class of network-based features derived from EHR data. Our results also showed the utility of heterogeneous models for different subpopulations instead of just one model for the entire population. In particular, we obtained good performance for subpopulations in an ICU or ED and those with certain test histories. We also observed that the performance degrades gradually for a <italic>d</italic>-days ahead prediction.</p>
        <p>The testing policy is fairly systematic for patients in the ICU. Therefore, we expect the model for ICU subpopulations to be quite robust and generalizable to data sets from other locations. On the other hand, it is important to note that testing in the entire patient population is generally not completely systematic and might have biases because it is administered per physician request. It is unclear what the impact of these biases would be on the model’s generalizability. A mitigating factor is that the model for the entire population is quite close to that for the ICU, and many of the significant factors are the same. This suggests that the model for the entire population might also be quite robust. Future studies on other data sets are required to determine the generalizability of these models.</p>
        <p>Our prediction model for a patient on day t only used features that were available for that patient before day t. This included the network features. Therefore, if a patient was in the hospital for &#60;7 days, the “MRSA 7” and “Provider 7” feature values will be 0, and if a patient was in the hospital for &#60;14 days, the “MRSA 14” and “Provider 14” feature values will be 0. It is possible that the predictive model would be more informative for patients who have a longer history in the hospital, but even this is an important patient population from a clinical perspective.</p>
        <p>Finally, we noted that the simple penalized LR model seems to work quite well when given more complex features, such as second-degree features. It is not completely clear why this works much better than the other methods, namely support vector machine, random forest, gradient-boosted classifiers, and XGBoost. One possible explanation is the model parsimony of the penalized LR. Further research on model validation would be useful. One advantage of our analysis is that the penalized LR method is easy to interpret.</p>
        <p>Our models are the most useful for clinical decisions about empiric antibiotic use. For instance, if the test prediction is negative, a clinician could be more comfortable starting an antibiotic treatment. If the test prediction is positive in the context of a newly identified infection, a clinician might consider the benefits of starting an anti-MRSA antibiotic. Isolation precautions are known to have many adverse effects (eg, fewer clinician visits to the room, patient depression, and noninfectious adverse events such as blood clots), although they help in reducing transmission. If the <italic>d</italic>-days ahead result is negative in a current patient with a positive MRSA result, an epidemiologist may adjust for an earlier test for clearance of isolation precautions.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>Machine learning using EHR data for clinical informatics is a very active area of research [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Diverse kinds of statistical and machine learning methods, including deep-learning algorithms, have been used to predict important clinical events (eg, hypertension, diabetes, chronic obstructive pulmonary disease, arrhythmia, asthma, gastritis, dementia, delirium, <italic>Clostridium difficile</italic> infection, and HAIs) using EHR data [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. In the context of HAIs, risk-prediction models have been developed for several MDROs. We have briefly discussed examples of such studies to illustrate the types of questions and methods that have been considered, with a focus on MRSA.</p>
        <p>Hartvigsen et al [<xref ref-type="bibr" rid="ref8">8</xref>] and Hirano et al [<xref ref-type="bibr" rid="ref9">9</xref>] studied a similar problem, namely, predicting MRSA test outcomes, using the Medical Information Mart for Intensive Care III and IV data sets, respectively. These data sets are critical care data sets comprising 12 years (2001 to 2012 and 2008 to 2019, respectively) of patient records from the Beth Israel Deaconess Medical Center Intensive Care Unit in Boston, Massachusetts [<xref ref-type="bibr" rid="ref11">11</xref>]. Hartvigsen et al [<xref ref-type="bibr" rid="ref8">8</xref>] show high performance for the prediction of MRSA test outcomes 1 day ahead using subsampled data. Hirano et al [<xref ref-type="bibr" rid="ref9">9</xref>] achieve high performance (an ROC-AUC value of 0.89) for a slightly different patient subpopulation using the SMOTE [<xref ref-type="bibr" rid="ref21">21</xref>] technique for handling data imbalance. Rhodes et al [<xref ref-type="bibr" rid="ref12">12</xref>] consider a slightly different question regarding MRSA infection 72 hours after admission. They show that the Classification Tree Analysis has good performance for the population of patients from the Northwestern Memorial Hospital and Lake Forest Hospital. A review by Tang et al [<xref ref-type="bibr" rid="ref13">13</xref>] notes that penalized LR, decision tree, and random forest are the preferred methods for antimicrobial resistance prediction.</p>
        <p>A significant challenge in all MRSA risk-prediction problems (including our study) is that the data are quite imbalanced because the fraction of positive observations is quite small. Consequently, the performance of most machine learning methods can be affected. A common strategy to address this issue has been to construct data sets using different kinds of sampling techniques, including biased sampling [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>] and SMOTE [<xref ref-type="bibr" rid="ref30">30</xref>]. While this kind of approach can appear to have very good performance on a similarly constructed test data set, the true performance on an unbiased data set might be reduced (as discussed in the study by Pencina et al [<xref ref-type="bibr" rid="ref31">31</xref>] and in our Results section), which impacts its performance when used in practice. According to the study by Soltanzadeh and Hashemzadeh [<xref ref-type="bibr" rid="ref30">30</xref>], resolving the class distribution problem using synthetic or biased data constructed in this manner causes many issues such as (1) generalization problems because of noisy samples; (2) uninformative samples; and (3) newly created points being close to the minority class points, which often create points around the decision boundary. Azizi et al [<xref ref-type="bibr" rid="ref32">32</xref>] and Kokosi and Harron [<xref ref-type="bibr" rid="ref33">33</xref>] note that (1) the use of synthetic data in the decision-making process and (2) the problem of attribute disclosure are other limitations of using synthetic data.</p>
        <p>Our study differs from prior work in 3 ways. First, we used network features in addition to other EHR-based features in our risk-prediction models. It has been shown that network properties are predictive of infection risk; for example, Klein et al [<xref ref-type="bibr" rid="ref34">34</xref>] showed that patient degree is associated with vancomycin-resistant enterococci risk. Similarly, Riaz et al [<xref ref-type="bibr" rid="ref35">35</xref>] show that local colonization pressure, which is based on the network structure, is associated with <italic>C. difficile</italic> infection (CDI) risk. Similarly, Miller et al [<xref ref-type="bibr" rid="ref36">36</xref>] show that household exposure (which can also be viewed as a network effect) increases CDI risk. However, our work is the first to explicitly consider network features derived from EHR data for MRSA test prediction as a machine learning task that can be used in a clinical setting. Second, we identified heterogeneous models for specific patient subgroups and showed that these have significantly better performance. Finally, we developed our prediction models without any biased sampling techniques.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>We have not been able to improve the ROC-AUC performance of our models above 0.90. Data imbalance and patient diversity could be significant reasons for this performance. As noted earlier, MRSA infections are fairly rare, and for the problem of MRSA test results, only about 15% of the results are positive. We also note that there are many other notions of MRSA risk, such as the risk of severe outcomes and MRSA acquisition, which we do not study here. These notions are harder to formalize and learn because the data sets would become even more biased than what we consider here, and new methods are needed for them.</p>
        <p>While our results show that network features are the most predictive, there might be uncertainties in inferring them from the EHR data. We note that these (eg, the #providers within a time interval) are not directly available in the patient’s EHR data; we are inferring them through colocation information. It is possible that many interactions are not recorded accurately or the times might not be accurate. More work is needed to fully understand the impact of these uncertainties.</p>
        <p>Another issue is the testing bias. As discussed earlier, the entire patient population data set has biases because testing is not very systematic in general. This might have an impact on the model’s performance when applied to data sets from other hospitals, and the model would have to be retrained. However, the model structure and specific features might still be relevant, especially because they hold for the ICU patient subpopulation, for which testing is more systematic.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Preprocessing by clustering has been useful in many applications. One challenge in using this approach is that a distance metric needs to be defined, which is difficult due to the diversity of features. For instance, some features are datetime related, some are Boolean and categorical, while others are real valued. A possible extension is to transform the features into a latent space, where distances can be computed. Additional feature engineering and more advanced machine learning methods might be useful for further improving performance. In particular, text analysis might be helpful in further improving the performance.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Machine learning model evaluation metrics.</p>
        <media xlink:href="ai_v3i1e48067_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 174 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Machine learning models.</p>
        <media xlink:href="ai_v3i1e48067_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 91 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Test history–based results and machine learning model hyperparameters.</p>
        <media xlink:href="ai_v3i1e48067_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 605 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Patient subpopulation-based results and machine learning model hyperparameters.</p>
        <media xlink:href="ai_v3i1e48067_app4.pdf" xlink:title="PDF File  (Adobe PDF File), 989 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUPRC</term>
          <def>
            <p>area under the precision-recall curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ED</term>
          <def>
            <p>emergency department</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">FNR</term>
          <def>
            <p>false negative rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">HAI</term>
          <def>
            <p>health care–associated infection</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ICU</term>
          <def>
            <p>intensive care unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MDRO</term>
          <def>
            <p>multidrug-resistant organism</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MRSA</term>
          <def>
            <p>methicillin-resistant Staphylococcus aureus</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">ROC-AUC</term>
          <def>
            <p>receiver operating characteristics-area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">SHAP</term>
          <def>
            <p>Shapley Additive Explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SMOTE</term>
          <def>
            <p>synthetic minority oversampling technique</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">UVA</term>
          <def>
            <p>University of Virginia</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was partially supported by the Centers for Disease Control and Prevention MInD-Healthcare Program (grant U01CK000589) and NSF grants CCF-1918656 and IIS-1955797. GM is an iTHRIV scholar. The iTHRIV Scholars Program is supported in part by the National Center for Advancing Translational Sciences of the National Institutes of Health under award numbers UL1TR003015 and KL2TR003016.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shallcross</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Davies</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>The World Health Assembly resolution on antimicrobial resistance</article-title>
          <source>J Antimicrob Chemother</source>
          <year>2014</year>
          <month>11</month>
          <volume>69</volume>
          <issue>11</issue>
          <fpage>2883</fpage>
          <lpage>5</lpage>
          <pub-id pub-id-type="doi">10.1093/jac/dku346</pub-id>
          <pub-id pub-id-type="medline">25204342</pub-id>
          <pub-id pub-id-type="pii">dku346</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weiner-Lastinger</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Abner</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Edwards</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Kallen</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Karlsson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Magill</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Pollock</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>See</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Soe</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Walters</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Dudeck</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Antimicrobial-resistant pathogens associated with adult healthcare-associated infections: summary of data reported to the National Healthcare Safety Network, 2015-2017</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>2020</year>
          <month>01</month>
          <volume>41</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>18</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31767041"/>
          </comment>
          <pub-id pub-id-type="doi">10.1017/ice.2019.296</pub-id>
          <pub-id pub-id-type="medline">31767041</pub-id>
          <pub-id pub-id-type="pii">S0899823X19002964</pub-id>
          <pub-id pub-id-type="pmcid">PMC8276252</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zimlichman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Henderson</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tamir</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Franz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Yamin</surname>
              <given-names>CK</given-names>
            </name>
            <name name-style="western">
              <surname>Keohane</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Denham</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>DW</given-names>
            </name>
          </person-group>
          <article-title>Health care-associated infections: a meta-analysis of costs and financial impact on the US health care system</article-title>
          <source>JAMA Intern Med</source>
          <year>2013</year>
          <month>12</month>
          <volume>173</volume>
          <issue>22</issue>
          <fpage>2039</fpage>
          <lpage>46</lpage>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2013.9763</pub-id>
          <pub-id pub-id-type="medline">23999949</pub-id>
          <pub-id pub-id-type="pii">1733452</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="web">
          <article-title>2019 AR threats report</article-title>
          <source>Centers for Disease Control and Prevention</source>
          <access-date>2024-04-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/drugresistance/biggest-threats.html">https://www.cdc.gov/drugresistance/biggest-threats.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>Core elements of hospital antibiotic stewardship programs</article-title>
          <source>Centers for Disease Control and Prevention</source>
          <access-date>2024-04-04</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/antibiotic-use/core-elements/hospital.html#:~:text=Reporting%3A%20Regularly%20report%20information%20on,antibiotic%20resistance%20and%20optimal%20prescribing">https://www.cdc.gov/antibiotic-use/core-elements/hospital.html#:~:text=Reporting%3A%20Regularly%20report%20information%20on,antibiotic%20resistance%20and%20optimal%20prescribing</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shang</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Goetz</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Diagnosis of MRSA with neural networks and logistic regression approach</article-title>
          <source>Health Care Manag Sci</source>
          <year>2000</year>
          <month>09</month>
          <volume>3</volume>
          <issue>4</issue>
          <fpage>287</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.1023/a:1019018129822</pub-id>
          <pub-id pub-id-type="medline">11105415</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dutta</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dutta</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>“Maximum probability rule” based classification of MRSA infections in hospital environment: using electronic nose</article-title>
          <source>Sens Actuators B Chem</source>
          <year>2006</year>
          <month>12</month>
          <day>14</day>
          <volume>120</volume>
          <issue>1</issue>
          <fpage>156</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.1016/j.snb.2006.02.013</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hartvigsen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Brownell</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Teeple</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Rundensteiner</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Early prediction of MRSA infections using electronic health records</article-title>
          <source>Proceedings of the 11th International Conference on Health Informatics</source>
          <year>2018</year>
          <conf-name>HEALTHINF 2018</conf-name>
          <conf-date>January 19-21, 2018</conf-date>
          <conf-loc>Madeira, Portugal</conf-loc>
          <pub-id pub-id-type="doi">10.5220/0006599601560167</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirano</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shinmoto</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Okada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Suga</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bombard</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Murahata</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shrestha</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ocheja</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tanaka</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Machine learning approach to predict positive screening of Methicillin-resistant Staphylococcus aureus during mechanical ventilation using synthetic dataset from MIMIC-IV database</article-title>
          <source>Front Med (Lausanne)</source>
          <year>2021</year>
          <month>11</month>
          <day>16</day>
          <volume>8</volume>
          <fpage>694520</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34869405"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fmed.2021.694520</pub-id>
          <pub-id pub-id-type="medline">34869405</pub-id>
          <pub-id pub-id-type="pmcid">PMC8635043</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>YE</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>YC</given-names>
            </name>
            <name name-style="western">
              <surname>Muder</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>Validation study of artificial neural network models for prediction of methicillin-resistant Staphylococcus aureus carriage</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>2008</year>
          <month>07</month>
          <volume>29</volume>
          <issue>7</issue>
          <fpage>607</fpage>
          <lpage>14</lpage>
          <pub-id pub-id-type="doi">10.1086/588588</pub-id>
          <pub-id pub-id-type="medline">18549315</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LW</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rhodes</surname>
              <given-names>NJ</given-names>
            </name>
            <name name-style="western">
              <surname>Rohani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yarnold</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Pawlowski</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Malczynski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sutton</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Zembower</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Wunderink</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>Machine learning to stratify methicillin-resistant Staphylococcus aureus risk among hospitalized patients with community-acquired pneumonia</article-title>
          <source>Antimicrob Agents Chemother</source>
          <year>2023</year>
          <month>01</month>
          <day>24</day>
          <volume>67</volume>
          <issue>1</issue>
          <fpage>e0102322</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36472425"/>
          </comment>
          <pub-id pub-id-type="doi">10.1128/aac.01023-22</pub-id>
          <pub-id pub-id-type="medline">36472425</pub-id>
          <pub-id pub-id-type="pmcid">PMC9872682</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Machine learning in predicting antimicrobial resistance: a systematic review and meta-analysis</article-title>
          <source>Int J Antimicrob Agents</source>
          <year>2022</year>
          <volume>60</volume>
          <issue>5-6</issue>
          <fpage>106684</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijantimicag.2022.106684</pub-id>
          <pub-id pub-id-type="medline">36279973</pub-id>
          <pub-id pub-id-type="pii">S0924-8579(22)00211-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shenoy</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Noubary</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenberg</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Cotter</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Walensky</surname>
              <given-names>RP</given-names>
            </name>
            <name name-style="western">
              <surname>Hooper</surname>
              <given-names>DC</given-names>
            </name>
          </person-group>
          <article-title>Concordance of PCR and culture from nasal swabs for detection of methicillin-resistant Staphylococcus aureus in a setting of concurrent antistaphylococcal antibiotics</article-title>
          <source>J Clin Microbiol</source>
          <year>2014</year>
          <month>04</month>
          <volume>52</volume>
          <issue>4</issue>
          <fpage>1235</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24452168"/>
          </comment>
          <pub-id pub-id-type="doi">10.1128/JCM.02972-13</pub-id>
          <pub-id pub-id-type="medline">24452168</pub-id>
          <pub-id pub-id-type="pii">JCM.02972-13</pub-id>
          <pub-id pub-id-type="pmcid">PMC3993487</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boyce</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Potter-Bynoe</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chenevert</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Environmental contamination due to methicillin-resistant Staphylococcus aureus: possible infection control implications</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>1997</year>
          <month>09</month>
          <volume>18</volume>
          <issue>9</issue>
          <fpage>622</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="medline">9309433</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Herold</surname>
              <given-names>BC</given-names>
            </name>
            <name name-style="western">
              <surname>Immergluck</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Maranan</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Lauderdale</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Gaskin</surname>
              <given-names>RE</given-names>
            </name>
            <name name-style="western">
              <surname>Boyle-Vavra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Leitch</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Daum</surname>
              <given-names>RS</given-names>
            </name>
          </person-group>
          <article-title>Community-acquired methicillin-resistant Staphylococcus aureus in children with no identified predisposing risk</article-title>
          <source>JAMA</source>
          <year>1998</year>
          <month>02</month>
          <day>25</day>
          <volume>279</volume>
          <issue>8</issue>
          <fpage>593</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.279.8.593</pub-id>
          <pub-id pub-id-type="medline">9486753</pub-id>
          <pub-id pub-id-type="pii">joc71943</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nick</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>KM</given-names>
            </name>
          </person-group>
          <article-title>Logistic regression</article-title>
          <source>Methods Mol Biol</source>
          <year>2007</year>
          <volume>404</volume>
          <fpage>273</fpage>
          <lpage>301</lpage>
          <pub-id pub-id-type="doi">10.1007/978-1-59745-530-5_14</pub-id>
          <pub-id pub-id-type="medline">18450055</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cortes</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Vapnik</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Support-vector networks</article-title>
          <source>Mach Learn</source>
          <year>1995</year>
          <month>09</month>
          <volume>20</volume>
          <fpage>273</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.1007/bf00994018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Breiman</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Random forests</article-title>
          <source>Mach Learn</source>
          <year>2001</year>
          <volume>45</volume>
          <fpage>5</fpage>
          <lpage>32</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1023/A:1010933404324"/>
          </comment>
          <pub-id pub-id-type="doi">10.1023/A:1010933404324</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SI</given-names>
            </name>
          </person-group>
          <article-title>A unified approach to interpreting model predictions</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 22, 2017</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1705.07874"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chawla</surname>
              <given-names>NV</given-names>
            </name>
            <name name-style="western">
              <surname>Bowyer</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>LO</given-names>
            </name>
            <name name-style="western">
              <surname>Kegelmeyer</surname>
              <given-names>WP</given-names>
            </name>
          </person-group>
          <article-title>SMOTE: synthetic minority over-sampling technique</article-title>
          <source>J Artif Intell Res</source>
          <year>2002</year>
          <month>06</month>
          <day>01</day>
          <volume>16</volume>
          <fpage>321</fpage>
          <lpage>57</lpage>
          <pub-id pub-id-type="doi">10.1613/jair.953</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qi</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Stochastic optimization of areas under precision-recall curves with provable convergence</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online April 18, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.08736"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>MI</given-names>
            </name>
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>TM</given-names>
            </name>
          </person-group>
          <article-title>Machine learning: trends, perspectives, and prospects</article-title>
          <source>Science</source>
          <year>2015</year>
          <month>07</month>
          <day>17</day>
          <volume>349</volume>
          <issue>6245</issue>
          <fpage>255</fpage>
          <lpage>60</lpage>
          <pub-id pub-id-type="doi">10.1126/science.aaa8415</pub-id>
          <pub-id pub-id-type="medline">26185243</pub-id>
          <pub-id pub-id-type="pii">349/6245/255</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wiens</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shenoy</surname>
              <given-names>ES</given-names>
            </name>
          </person-group>
          <article-title>Machine learning for healthcare: on the verge of a major shift in healthcare epidemiology</article-title>
          <source>Clin Infect Dis</source>
          <year>2018</year>
          <month>01</month>
          <day>06</day>
          <volume>66</volume>
          <issue>1</issue>
          <fpage>149</fpage>
          <lpage>53</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29020316"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/cid/cix731</pub-id>
          <pub-id pub-id-type="medline">29020316</pub-id>
          <pub-id pub-id-type="pii">4085880</pub-id>
          <pub-id pub-id-type="pmcid">PMC5850539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhagwat</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Viviano</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Voineskos</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Chakravarty</surname>
              <given-names>MM</given-names>
            </name>
            <collab>Alzheimer’s Disease Neuroimaging Initiative</collab>
          </person-group>
          <article-title>Modeling and prediction of clinical symptom trajectories in Alzheimer's disease using longitudinal data</article-title>
          <source>PLoS Comput Biol</source>
          <year>2018</year>
          <month>09</month>
          <day>14</day>
          <volume>14</volume>
          <issue>9</issue>
          <fpage>e1006376</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pcbi.1006376"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1006376</pub-id>
          <pub-id pub-id-type="medline">30216352</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-17-01878</pub-id>
          <pub-id pub-id-type="pmcid">PMC6157905</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cleret de Langavant</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bayen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Yaffe</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised machine learning to identify high likelihood of dementia in population-based surveys: development and validation study</article-title>
          <source>J Med Internet Res</source>
          <year>2018</year>
          <month>07</month>
          <day>09</day>
          <volume>20</volume>
          <issue>7</issue>
          <fpage>e10493</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2018/7/e10493/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/10493</pub-id>
          <pub-id pub-id-type="medline">29986849</pub-id>
          <pub-id pub-id-type="pii">v20i7e10493</pub-id>
          <pub-id pub-id-type="pmcid">PMC6056741</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Makar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fusco</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>McCaffrey</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ryan</surname>
              <given-names>EE</given-names>
            </name>
            <name name-style="western">
              <surname>Washer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>West</surname>
              <given-names>LR</given-names>
            </name>
            <name name-style="western">
              <surname>Young</surname>
              <given-names>VB</given-names>
            </name>
            <name name-style="western">
              <surname>Guttag</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hooper</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Shenoy</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Wiens</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A generalizable, data-driven approach to predict daily risk of Clostridium difficile infection at two large academic health centers</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>2018</year>
          <month>04</month>
          <volume>39</volume>
          <issue>4</issue>
          <fpage>425</fpage>
          <lpage>33</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29576042"/>
          </comment>
          <pub-id pub-id-type="doi">10.1017/ice.2018.16</pub-id>
          <pub-id pub-id-type="medline">29576042</pub-id>
          <pub-id pub-id-type="pii">S0899823X18000168</pub-id>
          <pub-id pub-id-type="pmcid">PMC6421072</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Young</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzales</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Douglas</surname>
              <given-names>VC</given-names>
            </name>
            <name name-style="western">
              <surname>Hadley</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Development and validation of an electronic health record-based machine learning model to estimate delirium risk in newly hospitalized patients without known cognitive impairment</article-title>
          <source>JAMA Netw Open</source>
          <year>2018</year>
          <month>08</month>
          <day>03</day>
          <volume>1</volume>
          <issue>4</issue>
          <fpage>e181018</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30646095"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2018.1018</pub-id>
          <pub-id pub-id-type="medline">30646095</pub-id>
          <pub-id pub-id-type="pii">2695078</pub-id>
          <pub-id pub-id-type="pmcid">PMC6324291</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Clinical assistant diagnosis for electronic medical record based on convolutional neural network</article-title>
          <source>Sci Rep</source>
          <year>2018</year>
          <month>04</month>
          <day>20</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>6329</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-018-24389-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-018-24389-w</pub-id>
          <pub-id pub-id-type="medline">29679019</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-018-24389-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC5910396</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soltanzadeh</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hashemzadeh</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>RCSMOTE: range-controlled synthetic minority over-sampling technique for handling the class imbalance problem</article-title>
          <source>Inf Sci</source>
          <year>2021</year>
          <month>01</month>
          <day>04</day>
          <volume>542</volume>
          <fpage>92</fpage>
          <lpage>111</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ins.2020.07.014</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pencina</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Goldstein</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>D'Agostino</surname>
              <given-names>RB</given-names>
            </name>
          </person-group>
          <article-title>Prediction models - development, evaluation, and clinical application</article-title>
          <source>N Engl J Med</source>
          <year>2020</year>
          <month>04</month>
          <day>23</day>
          <volume>382</volume>
          <issue>17</issue>
          <fpage>1583</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMp2000589</pub-id>
          <pub-id pub-id-type="medline">32320568</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pilote</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <collab>GOING-FWD Collaborators</collab>
          </person-group>
          <article-title>Can synthetic data be a proxy for real clinical trial data? A validation study</article-title>
          <source>BMJ Open</source>
          <year>2021</year>
          <month>04</month>
          <day>16</day>
          <volume>11</volume>
          <issue>4</issue>
          <fpage>e043497</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmjopen.bmj.com/lookup/pmidlookup?view=long&#38;pmid=33863713"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2020-043497</pub-id>
          <pub-id pub-id-type="medline">33863713</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2020-043497</pub-id>
          <pub-id pub-id-type="pmcid">PMC8055130</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kokosi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Harron</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Synthetic data in medical research</article-title>
          <source>BMJ Med</source>
          <year>2022</year>
          <month>09</month>
          <day>26</day>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>e000167</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36936569"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjmed-2022-000167</pub-id>
          <pub-id pub-id-type="medline">36936569</pub-id>
          <pub-id pub-id-type="pii">bmjmed-2022-000167</pub-id>
          <pub-id pub-id-type="pmcid">PMC9951365</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>EY</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>KK</given-names>
            </name>
            <name name-style="western">
              <surname>Hinson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Toerper</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Amoah</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tamma</surname>
              <given-names>PD</given-names>
            </name>
            <name name-style="western">
              <surname>Levin</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Milstone</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>The role of healthcare worker-mediated contact networks in the transmission of vancomycin-resistant enterococci</article-title>
          <source>Open Forum Infect Dis</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>ofaa056</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32166095"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ofid/ofaa056</pub-id>
          <pub-id pub-id-type="medline">32166095</pub-id>
          <pub-id pub-id-type="pii">ofaa056</pub-id>
          <pub-id pub-id-type="pmcid">PMC7060899</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Riaz</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Polgreen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Segre</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sewell</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pemmaraju</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Highly local Clostridioides difficile infection (CDI) pressure as risk factors for CDI</article-title>
          <source>Infect Control Hosp Epidemiol</source>
          <year>2020</year>
          <month>11</month>
          <day>02</day>
          <volume>41</volume>
          <issue>S1</issue>
          <fpage>s250</fpage>
          <pub-id pub-id-type="doi">10.1017/ice.2020.810</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Arakkal</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Sewell</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Segre</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Pemmaraju</surname>
              <given-names>SV</given-names>
            </name>
            <name name-style="western">
              <surname>Polgreen</surname>
              <given-names>PM</given-names>
            </name>
          </person-group>
          <article-title>Risk for asymptomatic household transmission of Clostridioides difficile infection associated with recently hospitalized family members</article-title>
          <source>Emerg Infect Dis</source>
          <year>2022</year>
          <month>05</month>
          <volume>28</volume>
          <issue>5</issue>
          <fpage>932</fpage>
          <lpage>939</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3201/eid2805.212023"/>
          </comment>
          <pub-id pub-id-type="doi">10.3201/eid2805.212023</pub-id>
          <pub-id pub-id-type="medline">35447064</pub-id>
          <pub-id pub-id-type="pmcid">PMC9045444</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
