<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v1i1e41030</article-id>
      <article-id pub-id-type="pmid">38875545</article-id>
      <article-id pub-id-type="doi">10.2196/41030</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Chronic Disease Prediction Using the Common Data Model: Development Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Malin</surname>
            <given-names>Bradley</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Finny</surname>
            <given-names>Abraham</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Soerensen</surname>
            <given-names>Simon John Christoph</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kumar</surname>
            <given-names>Vishnu</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Chanjung</given-names>
          </name>
          <degrees>BA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4637-9709</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Jo</surname>
            <given-names>Brian</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5065-6441</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Woo</surname>
            <given-names>Hyunki</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4868-6270</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Im</surname>
            <given-names>Yoori</given-names>
          </name>
          <degrees>MPH</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4837-3781</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Park</surname>
            <given-names>Rae Woong</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Department of Biomedical Informatics</institution>
            <institution>Ajou University Hospital</institution>
            <addr-line>164, World cup-ro, Yeongtong-gu, Suwon-si, Gyeonggi-do</addr-line>
            <addr-line>Suwon, 16499</addr-line>
            <country>Republic of Korea</country>
            <phone>82 01073375540</phone>
            <email>veritas@ajou.ac.kr</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4989-3287</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Park</surname>
            <given-names>ChulHyoung</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0531-9144</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Evidnet</institution>
        <addr-line>Seongnam</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>Ajou University Hospital</institution>
        <addr-line>Suwon</addr-line>
        <country>Republic of Korea</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Rae Woong Park <email>veritas@ajou.ac.kr</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Jan-Dec</season>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>22</day>
        <month>12</month>
        <year>2022</year>
      </pub-date>
      <volume>1</volume>
      <issue>1</issue>
      <elocation-id>e41030</elocation-id>
      <history>
        <date date-type="received">
          <day>13</day>
          <month>7</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>12</day>
          <month>10</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>21</day>
          <month>11</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>26</day>
          <month>11</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Chanjung Lee, Brian Jo, Hyunki Woo, Yoori Im, Rae Woong Park, ChulHyoung Park. Originally published in JMIR AI (https://ai.jmir.org), 22.12.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2022/1/e41030" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Chronic disease management is a major health issue worldwide. With the paradigm shift to preventive medicine, disease prediction modeling using machine learning is gaining importance for precise and accurate medical judgement.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to develop high-performance prediction models for 4 chronic diseases using the common data model (CDM) and machine learning and to confirm the possibility for the extension of the proposed models.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>In this study, 4 major chronic diseases—namely, diabetes, hypertension, hyperlipidemia, and cardiovascular disease—were selected, and a model for predicting their occurrence within 10 years was developed. For model development, the Atlas analysis tool was used to define the chronic disease to be predicted, and data were extracted from the CDM according to the defined conditions. A model for predicting each disease was built with 4 algorithms verified in previous studies, and the performance was compared after applying a grid search.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>For the prediction of each disease, we applied 4 algorithms (logistic regression, gradient boosting, random forest, and extreme gradient boosting), and all models showed greater than 80% accuracy. As compared to the optimized model’s performance, extreme gradient boosting presented the highest predictive performance for the 4 diseases (diabetes, hypertension, hyperlipidemia, and cardiovascular disease), with an accuracy of 80% or greater and an area under the curve ranging from 0.84 to 0.93.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study demonstrates the possibility for the preemptive management of chronic diseases by predicting the occurrence of chronic diseases using the CDM and machine learning. With these models, the risk of developing major chronic diseases within 10 years can be demonstrated by identifying health risk factors using our chronic disease prediction machine learning model developed with the real-world data–based CDM and National Health Insurance Corporation examination data that individuals can easily obtain.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>common data model</kwd>
        <kwd>chronic disease</kwd>
        <kwd>prediction model</kwd>
        <kwd>machine learning</kwd>
        <kwd>disease management</kwd>
        <kwd>data model</kwd>
        <kwd>disease prediction</kwd>
        <kwd>prediction</kwd>
        <kwd>risk prediction</kwd>
        <kwd>risk factors</kwd>
        <kwd>health risk</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>World Health Organization’s Global Action Plan (2013-2020) for noninfectious diseases aims to reduce the premature death rate stemming from chronic diseases by 25% by 2025 [<xref ref-type="bibr" rid="ref1">1</xref>]. The plan also urges the establishment of national policies and management of performance indicators.</p>
      <p>Accordingly, the Ministry of Health and Welfare of South Korea has designated cardiovascular disease, diabetes, chronic respiratory disease, and cancer as chronic diseases to be managed by the government [<xref ref-type="bibr" rid="ref2">2</xref>] and established a chronic disease management system centered on local hospitals. In March 2014, a community primary care pilot project for patients with high blood pressure and diabetes was initiated. In September 2016, the chronic disease management pilot project was carried out. From January 2019 to the present, a primary medical chronic disease management pilot project has been conducted. Nevertheless, chronic diseases remain the primary cause of mortality and increasing medical expenses. According to the Korea Centers for Disease Control and Prevention, in 2020, chronic diseases were responsible for 7 out of 10 deaths in the country, accounting for 83.7% of total medical expenses [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
      <p>Chronic diseases develop from metabolic syndrome, which is caused by lifestyle or individual genetic and environmental factors [<xref ref-type="bibr" rid="ref4">4</xref>]. The development of chronic disease leads to various complications or requires long-term treatment [<xref ref-type="bibr" rid="ref5">5</xref>]. Therefore, it is important to take preemptive measures along with the prevention of metabolic syndrome. In this respect, it is necessary to develop various disease prediction models to reduce the risk of complications and medical costs.</p>
      <p>Fortunately, early-stage disease prediction is gaining momentum with the use of real-world data combined with machine learning technology. Lee et al [<xref ref-type="bibr" rid="ref6">6</xref>] predicted the risk of metabolic syndrome (area under the curve [AUC]=0.879) using machine learning techniques, and Choi et al [<xref ref-type="bibr" rid="ref7">7</xref>] predicted disease occurrence using recurrent neural networks (diagnosis up to 79%). Lipton et al [<xref ref-type="bibr" rid="ref8">8</xref>] predicted the probability of chronic disease by applying long short-term memory (AUC=0.81-0.99). However, the disease occurrence prediction models developed in South Korea used traditional statistical techniques, and most international predictive models were developed for Western White populations and therefore have reduced applicability to other countries and racial groups.</p>
      <p>Although there are already many studies using electronic medical record (EMR) and machine learning, they have limitations in requiring a definition of medical terms or preprocessing for standardization in multicentered studies, entailing that these studies cannot be synchronized with other prediction models. There are relatively few papers on predictive model development using the common data model (CDM; although version 6.0 was released recently, version 5.4 of the Observational Medical Outcomes Partnership CDM is supported by the Observational Health Data Sciences and Informatics suite of tools and methods), which can overcome these limitations. In this paper, we aimed to develop a scalable chronic disease prediction model using the CDM.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Subjects</title>
        <p>We used the data of 790,822 subjects with at least one year of hospital records among subjects aged ≥20 years who had also visited a tertiary hospital in South Korea (Ajou University Hospital in Suwon) from 1999 to 2020. To predict the risk of developing chronic diseases for the subjects as they age, patients with chronic diseases (type 2 diabetes, high blood pressure, hyperlipidemia, and cardiovascular disease) as the underlying disease were excluded (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The process of selecting subjects for the type 2 diabetes study. ALT: alanine aminotransferase; AST: aspartate aminotransferase; CDM: common data model; Cr: creatinine; DBP: diastolic blood pressure; FBS: fasting blood glucose; HDL: high-density lipoprotein; Hgb: hemoglobin; LDL: low-density lipoprotein; SBP: systolic blood pressure; TG: triglyceride.</p>
          </caption>
          <graphic xlink:href="ai_v1i1e41030_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Select Model Variables</title>
        <p>The public health checkup is a test for adults aged &#62;18 years in South Korea, and anyone can use it for free. Variables were selected based on the general examination of items from the National Health Insurance Service. A total of 19 variables were included, such as basic information, measurement information, lifestyle information, and history of diseases.</p>
      </sec>
      <sec>
        <title>Data Extraction</title>
        <p>The data used in the predictive model were extracted using the Atlas analysis tool (Observational Health Data Sciences and Informatics—a nonprofit consortium that allows researchers to perform design, characterization, and analysis). A cohort for chronic diseases was created through Atlas design for the variables used in the cohort. Concept IDs following the Systematized Nomenclature Of Medicine–Clinical Terms terminology were used, which are mapped to the International Classification of Diseases, 10th Revision code and currently used as a diagnostic name in clinical practice. Systematized Nomenclature Of Medicine–Clinical Terms was developed to meet the various needs and expectations of clinicians around the world, and it is an international standard terminology system used in more than 80 countries, helping to consistently express clinical contents in medical information records. Additionally, concept IDs following the Local Laboratory Result Code terminology, mapped with the managed local code, were used. Local Laboratory Result Code refers to international standard test terms, and medical terms are defined and standardized for the standardization of test codes. <xref ref-type="table" rid="table1">Table 1</xref> shows the concept IDs used in the defined cohort group.</p>
        <p>The defined cohort group was divided into a disease-occurring group and a nonoccurring group according to the presence or absence of a diagnosed chronic disease within 10 years from the index date (when the criteria for participation in the study were met). Cohort generation and data extraction were performed according to the design criteria shown in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Concept ID information.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="120"/>
            <col width="480"/>
            <col width="100"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Variables</td>
                <td>Concept ID</td>
                <td>Concept name</td>
                <td>Type</td>
                <td>Vocabulary</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Type 2 diabetes</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>201826</p>
                    </list-item>
                  </list>
                </td>
                <td>Type 2 diabetes mellitus</td>
                <td>Factor</td>
                <td>SNOMED-CT<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>Hypertension</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>316866</p>
                    </list-item>
                  </list>
                </td>
                <td>Hypertensive disorder</td>
                <td>Factor</td>
                <td>SNOMED-CT</td>
              </tr>
              <tr valign="top">
                <td>Hyperlipidemia</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>432867</p>
                    </list-item>
                  </list>
                </td>
                <td>Hyperlipidemia</td>
                <td>Factor</td>
                <td>SNOMED-CT</td>
              </tr>
              <tr valign="top">
                <td>Cardiovascular disease</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>134057</p>
                    </list-item>
                  </list>
                </td>
                <td>Disorder of cardiovascular system</td>
                <td>Factor</td>
                <td>SNOMED-CT</td>
              </tr>
              <tr valign="top">
                <td>BMI</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3038553</p>
                    </list-item>
                  </list>
                </td>
                <td>Body mass index (ratio)</td>
                <td>Numeric</td>
                <td>LOINC<sup>b</sup></td>
              </tr>
              <tr valign="top">
                <td>SBP<sup>c</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3004249</p>
                    </list-item>
                  </list>
                </td>
                <td>Systolic blood pressure</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>DBP<sup>d</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3012888</p>
                    </list-item>
                  </list>
                </td>
                <td>Diastolic blood pressure</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>Total cholesterol</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3027114</p>
                    </list-item>
                  </list>
                </td>
                <td>Cholesterol (mass/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>HDL<sup>e</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3007070</p>
                    </list-item>
                  </list>
                </td>
                <td>Cholesterol in high-density lipoprotein (mass/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>LDL<sup>f</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3028437</p>
                    </list-item>
                  </list>
                </td>
                <td>Cholesterol in low-density lipoprotein (mass/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>TG<sup>g</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3022038</p>
                    </list-item>
                    <list-item>
                      <p>3022192</p>
                    </list-item>
                  </list>
                </td>
                <td>Triglyceride (mass/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>FBS<sup>h</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3040820</p>
                    </list-item>
                    <list-item>
                      <p>36303387</p>
                    </list-item>
                    <list-item>
                      <p>3037110</p>
                    </list-item>
                  </list>
                </td>
                <td>Fasting glucose (mass/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>Hgb<sup>i</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3000963</p>
                    </list-item>
                    <list-item>
                      <p>3027484</p>
                    </list-item>
                  </list>
                </td>
                <td>Hemoglobin (mass/volume) in blood</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>Cr<sup>j</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3016723</p>
                    </list-item>
                    <list-item>
                      <p>3051825</p>
                    </list-item>
                  </list>
                </td>
                <td>Creatinine (mass/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>AST<sup>k</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3013721</p>
                    </list-item>
                  </list>
                </td>
                <td>Aspartate aminotransferase (enzymatic activity/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
              <tr valign="top">
                <td>ALT<sup>l</sup></td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>3006923</p>
                    </list-item>
                    <list-item>
                      <p>46236949</p>
                    </list-item>
                  </list>
                </td>
                <td>Alanine aminotransferase (enzymatic activity/volume) in serum or plasma</td>
                <td>Numeric</td>
                <td>LOINC</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>SNOMED-CT: Systematized Nomenclature Of Medicine–Clinical Terms.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>LOINC: Local Laboratory Result Code.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>SBP: systolic blood pressure.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>DBP: diastolic blood pressure.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>HDL: high-density lipoprotein.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>LDL: low-density lipoprotein.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>TG: triglyceride.</p>
            </fn>
            <fn id="table1fn8">
              <p><sup>h</sup>FBS: fasting blood glucose.</p>
            </fn>
            <fn id="table1fn9">
              <p><sup>i</sup>Hgb: hemoglobin.</p>
            </fn>
            <fn id="table1fn10">
              <p><sup>j</sup>Cr: creatinine.</p>
            </fn>
            <fn id="table1fn11">
              <p><sup>k</sup>AST: aspartate aminotransferase.</p>
            </fn>
            <fn id="table1fn12">
              <p><sup>l</sup>ALT: alanine aminotransferase.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <boxed-text id="box1" position="float">
          <title>Design criteria.</title>
          <p>
            <bold>Target group</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Patients who visited the hospital from January 1, 1999, to May 31, 2020</p>
            </list-item>
            <list-item>
              <p>Patients with data for 180 days before and after the index date</p>
            </list-item>
            <list-item>
              <p>Patients diagnosed with chronic diseases (type 2 diabetes, hypertension, hyperlipidemia, or cardiovascular disease) within 10 years from the index date</p>
            </list-item>
          </list>
          <p>
            <bold>Comparator group</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Patients who visited the hospital from January 1, 1999, to May 31, 2020</p>
            </list-item>
            <list-item>
              <p>Patients with data for 180 days before and after the index date</p>
            </list-item>
            <list-item>
              <p>Patients who have not been diagnosed with chronic diseases (type 2 diabetes, hypertension, hyperlipidemia, or cardiovascular disease) within 10 years from the index date</p>
            </list-item>
          </list>
          <p>
            <bold>Exclusion criteria</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>A history of chronic diseases (diabetes, high blood pressure, hyperlipidemia, or cardiovascular disease) for any period before the selection duration</p>
            </list-item>
            <list-item>
              <p>Missing basic information, examination, and questionnaire items that were selected as essential items in the study for the development of the chronic disease prediction model</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Data Preparation</title>
        <p>We used the patient information, medical treatment, and examination data from a tertiary hospital in South Korea for the CDM. If the missing value was a numeric variable, it was replaced with the median of the matching gender for each age group (stratified into 5-year units), and in the case of a categorical variable, it was replaced with the mode of the matching gender for each age group. Since the number of samples between the 2 groups was unbalanced, random undersampling was performed in the nondiabetic group within 10 years to match the size of the diabetic group. Although data balancing can obscure the actual prevalence in practice, it ensures model performance for new data by preventing biased learning from highly imbalanced class problems.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>The descriptive statistics of each group (target group and comparator group) are shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Descriptive statistics.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="220"/>
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Feature</td>
                <td>Processed data</td>
                <td>Target</td>
                <td>Comparator</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="5">
                  <bold>Sex, n</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Female</td>
                <td>1157</td>
                <td>586</td>
                <td>571</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Male</td>
                <td>1691</td>
                <td>838</td>
                <td>853</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Age (years), mean (SD)</td>
                <td>47.56 (15.03)</td>
                <td>54.94 (12.50)</td>
                <td>40.17 (13.60)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">BMI, mean (SD)</td>
                <td>24.59 (5.68)</td>
                <td>25.69 (7.05)</td>
                <td>23.50 (3.55)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">SBP<sup>a</sup>, mean (SD)</td>
                <td>128.1 (16.95)</td>
                <td>132.5 (17.49)</td>
                <td>123.6 (14.86)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">DBP<sup>b</sup>, mean (SD)</td>
                <td>79.29 (11.89)</td>
                <td>81.56 (12.04)</td>
                <td>77.03 (11.16)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Total cholesterol, mean (SD)</td>
                <td>189.4 (39.84)</td>
                <td>192.9 (42.40)</td>
                <td>185.8 (36.76)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">HDL<sup>c</sup>, mean (SD)</td>
                <td>51.45 (12.52)</td>
                <td>47.55 (10.82)</td>
                <td>55.36 (13.01)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">LDL<sup>d</sup>, mean (SD)</td>
                <td>111.9 (29.57)</td>
                <td>114 (28.95)</td>
                <td>109.7 (30.13)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">TG<sup>e</sup>, mean (SD)</td>
                <td>143.1 (55.0)</td>
                <td>145.0 (59.06)</td>
                <td>115.2 (43.63)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">FBS<sup>f</sup>, mean (SD)</td>
                <td>116.4 (45.64)</td>
                <td>136.7 (54.40)</td>
                <td>96.04 (20.31)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Hgb<sup>g</sup>, mean (SD)</td>
                <td>14.27 (1.70)</td>
                <td>14.18 (1.83)</td>
                <td>14.36 (1.56)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">Cr<sup>h</sup>, mean (SD)</td>
                <td>0.99 (0.68)</td>
                <td>1.045 (0.92)</td>
                <td>0.93 (0.26)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">AST<sup>i</sup>, mean (SD)</td>
                <td>27.93 (9.64)</td>
                <td>31.71 (8.91)</td>
                <td>24.15 (11.20)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">ALT<sup>j</sup>, mean (SD)</td>
                <td>31.16 (11.89)</td>
                <td>37.46 (8.19)</td>
                <td>24.87 (14.87)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>SBP: systolic blood pressure.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>DBP: diastolic blood pressure.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>HDL: high-density lipoprotein.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>LDL: low-density lipoprotein.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>TG: triglyceride.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>FBS: fasting blood glucose.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>Hgb: hemoglobin.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>Cr: creatinine.</p>
            </fn>
            <fn id="table2fn9">
              <p><sup>i</sup>AST: aspartate aminotransferase.</p>
            </fn>
            <fn id="table2fn10">
              <p><sup>j</sup>ALT: alanine aminotransferase.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Models</title>
        <sec>
          <title>Overview</title>
          <p>In this study, to select the most suitable model for disease prediction, we used the following 4 algorithms: logistic regression (LR), random forest (RF), gradient boosting model (GBM), and extreme gradient boosting (XGBoost). LR using binary classification in the statistics field and the other 3 machine learning algorithms had shown better performance than similar prior research [<xref ref-type="bibr" rid="ref9">9</xref>]. Afterward, the prediction performance was compared. Model validation was conducted with the same 80% training data and 20% validation data derived from the entire data set. Accuracy, sensitivity, specificity, and AUC were used as model performance indicators. The prediction model flow is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Prediction model flow. CDM: common data model; ML: machine learning; OMOP: Observational Medical Outcomes Partnership; XGBoost: extreme gradient boosting.</p>
            </caption>
            <graphic xlink:href="ai_v1i1e41030_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>LR Algorithm</title>
          <p>LR was devised by Cox [<xref ref-type="bibr" rid="ref10">10</xref>] as a regression model that predicts the probability of the occurrence of an event with respect to a binary dependent variable. Unlike general linear regression analysis, the range of LR is limited to 0-1 because the dependent variable is dichotomous, and the conditional probability of the occurrence of an event also follows a binomial distribution. That is, if the estimated value following the logistic function satisfying the above assumption is less than 0.5, the predicted value is classified as “nonoccurring,” and if it is greater than 0.5, then the predicted value is classified as “occurring.” Although LR was developed in 1970, it is still being used for statistical analysis and predictive research in various fields.</p>
        </sec>
        <sec>
          <title>RF Algorithm</title>
          <p>RF is a tree-based ensemble model capable of both classification and regression and selects the most appropriate forest model by collecting the results of randomly generated independent decision trees [<xref ref-type="bibr" rid="ref11">11</xref>]. Bagging-based training data inputted to the tree provides model diversity, and the randomness of variable combinations constituting the tree can prevent model noise and the risk of overfitting. The fact that RF is less sensitive to missing values than other algorithms is also an advantage.</p>
        </sec>
        <sec>
          <title>GBM Algorithm</title>
          <p>GBM is a tree-based ensemble model similar to RF, but unlike RF, it creates a tree using a boosting method. The boosting method increases the performance of classification or prediction by sequentially combining several small models [<xref ref-type="bibr" rid="ref12">12</xref>]. GBM reduces the errors generated by the previous model. Although GBM shows high performance in prediction, it may take a lot of time to fit the model because training requires extensive computation. In recent years, GBM-based algorithms such as LightGBM [<xref ref-type="bibr" rid="ref13">13</xref>], CatBoost, and XGBoost have been developed to overcome the shortcomings of GBM.</p>
        </sec>
        <sec>
          <title>XGBoost Algorithm</title>
          <p>XGBoost is a representative tree-based ensemble model devised by Chen and Guestrin [<xref ref-type="bibr" rid="ref14">14</xref>]. It is a machine learning algorithm actively used in prediction and classification research because of its powerful performance and has many advantages such as fast learning due to parallel processing, overfitting regulation, and linkage with other algorithms. Since XGBoost is based on GBM, it optimizes the model by assigning weights using a boosting method, reducing the residual error of the model created with classification and regression tree algorithm–based trees.</p>
        </sec>
      </sec>
      <sec>
        <title>Grid Search</title>
        <p>Unlike LR analysis, machine learning algorithms support various parameters (hyperparameters) so that users can optimize the model. Grid search is a technique to find the parameter value when the model has the highest performance by sequentially applying the parameter values set by the user. We optimized the model by applying grid search to the above 3 algorithms (RF, GBM, and XGBoost). Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> presents the parameters and ranges used in the grid search for each algorithm.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Model Results</title>
        <p>Comparing model performance by chronic disease, the predictive model using XGBoost based on accuracy showed superior performance in all diseases compared to the other 3 models (<xref ref-type="table" rid="table3">Table 3</xref>).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Performance comparison of disease prediction models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="490"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <col width="120"/>
            <thead>
              <tr valign="bottom">
                <td colspan="2">Parameter, chronic disease</td>
                <td>LR<sup>a</sup></td>
                <td>RF<sup>b</sup></td>
                <td>GBM<sup>c</sup></td>
                <td>XGBoost<sup>d</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Accuracy</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Type 2 Diabetes</td>
                <td>0.877</td>
                <td>0.8743</td>
                <td>0.8743</td>
                <td>0.8824</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hypertension</td>
                <td>0.7783</td>
                <td>0.793</td>
                <td>0.7896</td>
                <td>0.8213</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hyperlipidemia</td>
                <td>0.8125</td>
                <td>0.82</td>
                <td>0.8325</td>
                <td>0.8325</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cardiovascular disease</td>
                <td>0.7941</td>
                <td>0.8162</td>
                <td>0.8235</td>
                <td>0.8429</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Sensitivity</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Type 2 Diabetes</td>
                <td>0.8852</td>
                <td>0.8804</td>
                <td>0.8684</td>
                <td>0.8705</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hypertension</td>
                <td>0.7758</td>
                <td>0.7758</td>
                <td>0.7783</td>
                <td>0.7934</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hyperlipidemia</td>
                <td>0.8141</td>
                <td>0.8556</td>
                <td>0.8077</td>
                <td>0.8182</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cardiovascular disease</td>
                <td>0.8143</td>
                <td>0.8644</td>
                <td>0.8030</td>
                <td>0.8243</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Specificity</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Type 2 Diabetes</td>
                <td>0.8691</td>
                <td>0.8684</td>
                <td>0.8804</td>
                <td>0.8950</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hypertension</td>
                <td>0.7808</td>
                <td>0.7808</td>
                <td>0.8019</td>
                <td>0.8550</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Hyperlipidemia</td>
                <td>0.8109</td>
                <td>0.8122</td>
                <td>0.8333</td>
                <td>0.8482</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cardiovascular disease</td>
                <td>0.8333</td>
                <td>0.7792</td>
                <td>0.7857</td>
                <td>0.8636</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>LR: logistic regression.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>RF: random forest.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>GBM: gradient boosting model.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>XGBoost: extreme gradient boosting.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Validation Results</title>
        <p><xref ref-type="table" rid="table4">Table 4</xref> shows the parameter values of each disease model outputted by the XGBoost grid search.</p>
        <p>The model evaluation indicators used were accuracy, sensitivity, specificity, and AUC. Over 80% prediction accuracy was achieved for all diseases, with AUC from 0.84 to 0.93. The XGBoost model performance by disease is shown in <xref ref-type="table" rid="table5">Table 5</xref> and <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Extreme gradient boosting grid search result.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="bottom">
                <td>Target disease</td>
                <td>Subsample<sup>a</sup></td>
                <td>Max depth<sup>b</sup></td>
                <td>Min child<sup>c</sup></td>
                <td>Eta<sup>d</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Type 2 diabetes</td>
                <td>0.7</td>
                <td>7</td>
                <td>2</td>
                <td>0.1</td>
              </tr>
              <tr valign="top">
                <td>Hypertension</td>
                <td>0.9</td>
                <td>3</td>
                <td>2</td>
                <td>0.01</td>
              </tr>
              <tr valign="top">
                <td>Hyperlipidemia</td>
                <td>1</td>
                <td>3</td>
                <td>2</td>
                <td>0.01</td>
              </tr>
              <tr valign="top">
                <td>Cardiovascular disease</td>
                <td>0.9</td>
                <td>3</td>
                <td>1</td>
                <td>0.01</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Subsample: sampling rate for each tree.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>Max depth: maximum depth of a tree.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>Min child: minimum sum of weights for all observations needed in the child.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>Eta: learning rate.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Predictive performance by model.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Target disease</td>
                <td>Accuracy</td>
                <td>Sensitivity</td>
                <td>Specificity</td>
                <td>AUC<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Type 2 diabetes</td>
                <td>0.8824</td>
                <td>0.8705</td>
                <td>0.8950</td>
                <td>0.9303</td>
              </tr>
              <tr valign="top">
                <td>Hypertension</td>
                <td>0.8213</td>
                <td>0.7934</td>
                <td>0.8550</td>
                <td>0.8704</td>
              </tr>
              <tr valign="top">
                <td>Hyperlipidemia</td>
                <td>0.8325</td>
                <td>0.8182</td>
                <td>0.8432</td>
                <td>0.8442</td>
              </tr>
              <tr valign="top">
                <td>Cardiovascular disease</td>
                <td>0.8429</td>
                <td>0.8243</td>
                <td>0.8636</td>
                <td>0.8726</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table5fn1">
              <p><sup>a</sup>AUC: area under the curve.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Receiver operating characteristic curves for XGBoost (A) type 2 diabetes model, (B) hypertension model, (C) hyperlipidemia model, and (D) cardiovascular disease model. AUC: area under the curve; XGBoost: extreme gradient boosting.</p>
          </caption>
          <graphic xlink:href="ai_v1i1e41030_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Shapley Additive Explanations Model Variable Importance</title>
        <p>In open-source programming languages (eg, Python and R), the XGBoost package shows model feature importance using its own library. However, small models are more combined and complicated, and the feature importance of small models becomes inconsistent. Therefore, we used the Shapley additive explanations (SHAP) method to represent the model’s feature importance, which had high consistency and accuracy [<xref ref-type="bibr" rid="ref15">15</xref>]. SHAP’s feature importance used the weighted average of marginal contribution for each feature (Shapley value). It gave the importance of the features and the positive or negative effect of each feature. The formula for the Shapley value is as follows:</p>
        <disp-formula>
          Contribution of feature<sub>i</sub> = β<sub>i</sub>x<sub>i</sub> – E(β<sub>i</sub>x<sub>i</sub>) = β<sub>i</sub>x<sub>i</sub> – β<sub>i</sub>E(x<sub>i</sub>)
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="ai_v1i1e41030_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>where Ø<italic><sub>i</sub></italic> is the Shapley value of data<italic><sub>i</sub></italic>, <italic>F</italic> is the full set, <italic>S</italic> is the subsets in total set excluding data<italic><sub>i</sub></italic>, <inline-graphic xlink:href="ai_v1i1e41030_fig6.png" xlink:type="simple" mimetype="image"/> is the contribution of the full set including data<italic><sub>i</sub></italic>, and <italic>f<sub>S</sub> (x<sub>S</sub>)</italic> is the contribution of subsets excluding data<italic><sub>i</sub></italic>.</p>
        <p>The SHAP value graph of the fitted model for each disease is presented in <xref rid="figure4" ref-type="fig">Figure 4</xref>.</p>
        <p>In the case of type 2 diabetes, fasting blood glucose (Shapley value=1.895), age (1.271), and BMI (0.245) influenced the occurrence of diabetes within 10 years [<xref ref-type="bibr" rid="ref16">16</xref>]. For hypertension, hyperlipidemia (1.272), cardiovascular disease (1.379), and age (1.117) had the greatest influence on disease occurrence. Furthermore, in the case of hyperlipidemia, it was found that among the variables excluding age, total cholesterol (0.616) and low-density lipoprotein (0.249) influenced disease occurrence, in the order presented [<xref ref-type="bibr" rid="ref17">17</xref>]. In the case of cardiovascular disease, systolic blood pressure (0.164) and high-density lipoprotein (0.113) had the second highest influence on disease occurrence. These results are consistent with the results of previous studies that studied the risk factors of the 4 chronic diseases [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Shapley additive explanations (SHAP) value graph of the fitted model for the importance of (A) type 2 diabetes variables, (B) hypertension variables, (C) hyperlipidemia variables, and (D) cardiovascular disease variables. ALT: alanine aminotransferase; AST: aspartate aminotransferase; DBP: diastolic blood pressure; HDL: high-density lipoprotein; LDL: low-density lipoprotein; SBP: systolic blood pressure.</p>
          </caption>
          <graphic xlink:href="ai_v1i1e41030_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study develops a disease prediction model with more than 80% accuracy by using the 16 National Health Insurance system test variables from the real-world data of a tertiary hospital in South Korea. Our study:</p>
        <list list-type="order">
          <list-item>
            <p>Presents the possibility of predicting diseases with universal and useful information on public health examinations,</p>
          </list-item>
          <list-item>
            <p>Explains the ability of model prediction results, and</p>
          </list-item>
          <list-item>
            <p>Presents the external verification and scalability using other organizations’ CDM.</p>
          </list-item>
        </list>
        <p>By observing recent research trends relating to disease prediction models using a CDM, the number of cases focusing on multicenter studies rather than single-center studies is increasing. Lee et al [<xref ref-type="bibr" rid="ref18">18</xref>] established an artificial intelligence (AI) learning platform for multicenter clinical research focusing on CDM linkage. Using data from Gachon University Gil Hospital to develop a machine learning model that predicts 5-year risk in patients with inflammatory bowel disease who started biologics, Choi et al [<xref ref-type="bibr" rid="ref19">19</xref>] externally validated the model with CDM data (Ministry of Food and Drug Safety). Johnston et al [<xref ref-type="bibr" rid="ref20">20</xref>] developed a model to predict whether patients will stop taking antihyperglycemic drugs within 1 to 2 years after laparoscopic metabolic surgery. Using psychiatric patient notes at Ajou University Hospital, Lee et al [<xref ref-type="bibr" rid="ref21">21</xref>] developed an NLP model that predicts the onset of psychosis in patients by learning, which is a representative case. As such, if the same cohort criterion is applied to multiple institutions in an expanded form along with disease prediction model construction and cross-validation, a more universal and robust model can be developed.</p>
        <p>Research is being conducted globally to reduce medical costs by predicting disease occurrence using AI. As the AI industry has grown, AI can reduce costs in providing care and increase the efficiency of medical jobs [<xref ref-type="bibr" rid="ref22">22</xref>]. In 2019, researchers from the Boston Institute of Technology and Boston Health Center conducted joint research using electronic health records and lifelog big data with AI in an attempt to prevent disease outbreaks and medical fraud. The findings may help reduce hospitalization costs, which account for a substantial portion of US medical expenses [<xref ref-type="bibr" rid="ref23">23</xref>]. The model developed through this study is expected to evolve into a similar system for South Koreans, by predicting the risk of future disease development and aiding self-health management.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <p>With the recent development of AI and data processing technology, research on disease prediction model development using National Health Insurance Service data [<xref ref-type="bibr" rid="ref9">9</xref>] or single-institution EMR data [<xref ref-type="bibr" rid="ref24">24</xref>] is steadily progressing. In this study, we developed chronic disease prediction models with relatively high performance compared to previous papers. A difference between this study and existing work is that the models have been developed using the CDM, so that we can expect improved precision through variable expansion and by simultaneously using multiorganizational data.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>A limitation of this study is that it uses a single-institution CDM from a tertiary hospital. Therefore, it cannot ensure generalizability. Additionally, the demographic variables (educational level, residential area, marital status, etc.) are insufficient compared to the health insurance service examination items. They are limited due to the focus on South Korean public checkups; by using more features related to the disease (hemoglobin A1C and biopsy data), the model becomes more accurate. Lastly, the model was trained using the cross-sectional data of patients. If the model is trained using time-series data (eg, the cohort of patients’ information that includes changes of laboratory results as time goes by), it could be much more comprehensive.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, 4 metabolic chronic diseases were selected, and disease prediction models were developed using the Ajou University Hospital CDM. To obtain a model suitable for disease prediction, the predictive performance of each model for disease occurrence was compared using the LR, GBM, RF, and XGBoost algorithms. The XGBoost model shows the best performance for all diseases. The performance of the XGBoost model was calculated as 0.9303, 0.8704, 0.8442, and 0.8726 AUC standards for type 2 diabetes, hypertension, hyperlipidemia, and cardiovascular disease, respectively. In addition, the importance of the variables was calculated through modeling, and the results are in line with previous clinical studies. We have confirmed that chronic diseases can be predicted, not just using single-institution EMR or public clinical data, but using the CDM in each local hospital.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Model’s hyperparameter range for grid search.</p>
        <media xlink:href="ai_v1i1e41030_app1.docx" xlink:title="DOCX File, 14 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CDM</term>
          <def>
            <p>common data model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EMR</term>
          <def>
            <p>electronic medical record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">GBM</term>
          <def>
            <p>gradient boosting model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">LR</term>
          <def>
            <p>logistic regression</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">RF</term>
          <def>
            <p>random forest</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SHAP</term>
          <def>
            <p>Shapley additive explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">XGBoost</term>
          <def>
            <p>extreme gradient boosting</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>World Health Organization</collab>
            <collab>Regional Office for Europe</collab>
          </person-group>
          <article-title>Action plan for the prevention and control of noncommunicable diseases in the WHO European Region</article-title>
          <source>World Health Organization</source>
          <year>2016</year>
          <access-date>2022-12-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://apps.who.int/iris/handle/10665/341522">https://apps.who.int/iris/handle/10665/341522</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>YE</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Impact of a primary health care chronic diseases management pilot program. Article in Korean</article-title>
          <source>Korean J Med</source>
          <year>2021</year>
          <month>02</month>
          <day>01</day>
          <volume>96</volume>
          <issue>1</issue>
          <fpage>7</fpage>
          <lpage>12</lpage>
          <pub-id pub-id-type="doi">10.3904/kjm.2021.96.1.7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>EK</given-names>
            </name>
          </person-group>
          <article-title>2020 chronic disease fact sheet. Article in Korean</article-title>
          <source>Korean Disease Control and Prevention Agency</source>
          <year>2021</year>
          <month>01</month>
          <day>26</day>
          <access-date>2022-12-14</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.kdca.go.kr/gallery.es?mid=a20503020000&#38;bid=0003&#38;act=view&#38;list_no=144928">https://www.kdca.go.kr/gallery.es?mid=a20503020000&#38;bid=0003&#38;act=view&#38;list_no=144928</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Son</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Seo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Rho</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Koo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yoo</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Moon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>HY</given-names>
            </name>
            <name name-style="western">
              <surname>Yun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Mok</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>KH</given-names>
            </name>
          </person-group>
          <article-title>Diabetes fact sheets in Korea, 2020: an appraisal of current status</article-title>
          <source>Diabetes Metab J</source>
          <year>2021</year>
          <month>01</month>
          <day>13</day>
          <volume>45</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33434426"/>
          </comment>
          <pub-id pub-id-type="doi">10.4093/dmj.2020.0254</pub-id>
          <pub-id pub-id-type="medline">33434426</pub-id>
          <pub-id pub-id-type="pii">dmj.2020.0254</pub-id>
          <pub-id pub-id-type="pmcid">PMC7850879</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>TR</given-names>
            </name>
          </person-group>
          <article-title>Development of T2DM prediction model using RNN. Article in Korean</article-title>
          <source>Journal of Digital Convergence</source>
          <year>2019</year>
          <month>08</month>
          <day>28</day>
          <volume>17</volume>
          <issue>8</issue>
          <fpage>249</fpage>
          <lpage>255</lpage>
          <pub-id pub-id-type="doi">10.14400/JDC.2019.17.8.249</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Koh</surname>
              <given-names>SB</given-names>
            </name>
          </person-group>
          <article-title>Development and validation of prediction model for risk reduction of metabolic syndrome by body weight control: a prospective population-based study</article-title>
          <source>Sci Rep</source>
          <year>2020</year>
          <month>06</month>
          <day>19</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>10006</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-020-67238-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-020-67238-5</pub-id>
          <pub-id pub-id-type="medline">32561810</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-020-67238-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC7305222</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bahadori</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Schuetz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Doctor AI: predicting clinical events via recurrent neural networks</article-title>
          <source>JMLR Workshop Conf Proc</source>
          <year>2016</year>
          <month>08</month>
          <volume>56</volume>
          <fpage>301</fpage>
          <lpage>318</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28286600"/>
          </comment>
          <pub-id pub-id-type="medline">28286600</pub-id>
          <pub-id pub-id-type="pmcid">PMC5341604</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lipton</surname>
              <given-names>ZC</given-names>
            </name>
            <name name-style="western">
              <surname>Kale</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Elkan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wetzel</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Learning to diagnose with LSTM recurrent neural networks</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on November 11, 2015</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1511.03677</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JO</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Machine learning-based cardiovascular disease prediction model: a cohort study on the Korean National Health Insurance Service Health Screening Database</article-title>
          <source>Diagnostics (Basel)</source>
          <year>2021</year>
          <month>05</month>
          <day>25</day>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>943</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=diagnostics11060943"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/diagnostics11060943</pub-id>
          <pub-id pub-id-type="medline">34070504</pub-id>
          <pub-id pub-id-type="pii">diagnostics11060943</pub-id>
          <pub-id pub-id-type="pmcid">PMC8229422</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>DR</given-names>
            </name>
          </person-group>
          <article-title>Regression models and life-tables</article-title>
          <source>Journal of the Royal Statistical Society: Series B (Methodological)</source>
          <year>2018</year>
          <month>12</month>
          <day>05</day>
          <volume>34</volume>
          <issue>2</issue>
          <fpage>187</fpage>
          <lpage>202</lpage>
          <pub-id pub-id-type="doi">10.1111/j.2517-6161.1972.tb00899.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Louppe</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Understanding random forests: from theory to practice</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on July 28, 2014</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1407.7502</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bühlmann</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hothorn</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Boosting algorithms: regularization, prediction and model fitting</article-title>
          <source>Statist Sci</source>
          <year>2007</year>
          <month>11</month>
          <day>01</day>
          <volume>22</volume>
          <issue>4</issue>
          <fpage>477</fpage>
          <lpage>505</lpage>
          <pub-id pub-id-type="doi">10.1214/07-STS242</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Predicting student performance in online learning using a highly efficient gradient boosting decision tree</article-title>
          <year>2022</year>
          <conf-name>IIP 2022: Intelligent Information Processing XI</conf-name>
          <conf-date>May 27-30, 2022</conf-date>
          <conf-loc>Qingdao, China</conf-loc>
          <fpage>508</fpage>
          <lpage>521</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-031-03948-5_41</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Guestrin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>XGBoost: a scalable tree boosting system</article-title>
          <year>2016</year>
          <month>08</month>
          <day>13</day>
          <conf-name>KDD '16: 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>August 13-17, 2016</conf-date>
          <conf-loc>San Francisco, CA</conf-loc>
          <fpage>785</fpage>
          <lpage>794</lpage>
          <pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>SI</given-names>
            </name>
          </person-group>
          <article-title>A unified approach to interpreting model predictions</article-title>
          <year>2017</year>
          <conf-name>NIPS 2017: 31st Conference on Neural Information Processing Systems</conf-name>
          <conf-date>December 4-9, 2017</conf-date>
          <conf-loc>Long Beach, CA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper/2017/hash/8a20a8621978632d76c43dfd28b67767-Abstract.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Prevalence and risk factors of type 2 diabetes according to gender among Korean employees. Article in Korean</article-title>
          <source>Journal of the Korea Academia-Industrial cooperation Society</source>
          <year>2015</year>
          <month>11</month>
          <day>30</day>
          <volume>16</volume>
          <issue>11</issue>
          <fpage>7589</fpage>
          <lpage>7598</lpage>
          <pub-id pub-id-type="doi">10.5762/KAIS.2015.16.11.7589</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>KY</given-names>
            </name>
          </person-group>
          <article-title>Risk factors for hypertension in elderly people aged 65 and over, and adults under age 65. Article in Korean</article-title>
          <source>Journal of the Korea Academia-Industrial cooperation Society</source>
          <year>2019</year>
          <month>01</month>
          <day>31</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>162</fpage>
          <lpage>169</lpage>
          <pub-id pub-id-type="doi">10.5762/KAIS.2019.20.1.162</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>No</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>CW</given-names>
            </name>
          </person-group>
          <article-title>Construction of artificial intelligence training platform for multi-center clinical research. Article in Korean</article-title>
          <source>KIPS Transactions on Computer and Communication Systems</source>
          <year>2020</year>
          <month>10</month>
          <day>31</day>
          <volume>9</volume>
          <issue>10</issue>
          <fpage>239</fpage>
          <lpage>246</lpage>
          <pub-id pub-id-type="doi">10.3745/KTCCS.2020.9.10.239</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>YI</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>KO</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>KG</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>YJ</given-names>
            </name>
          </person-group>
          <article-title>Development of machine learning model to predict the 5-year risk of starting biologic agents in patients with inflammatory bowel disease (IBD): K-CDM network study</article-title>
          <source>J Clin Med</source>
          <year>2020</year>
          <month>10</month>
          <day>26</day>
          <volume>9</volume>
          <issue>11</issue>
          <fpage>3427</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=jcm9113427"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/jcm9113427</pub-id>
          <pub-id pub-id-type="medline">33114505</pub-id>
          <pub-id pub-id-type="pii">jcm9113427</pub-id>
          <pub-id pub-id-type="pmcid">PMC7693158</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnston</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Morton</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Kalsekar</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Ammann</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Hsiao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Reps</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Using machine learning applied to real-world healthcare data for predictive analytics: an applied example in bariatric surgery</article-title>
          <source>Value Health</source>
          <year>2019</year>
          <month>05</month>
          <volume>22</volume>
          <issue>5</issue>
          <fpage>580</fpage>
          <lpage>586</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1098-3015(19)30073-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jval.2019.01.011</pub-id>
          <pub-id pub-id-type="medline">31104738</pub-id>
          <pub-id pub-id-type="pii">S1098-3015(19)30073-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>DY</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Son</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>YH</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>RW</given-names>
            </name>
          </person-group>
          <article-title>Psychosis relapse prediction leveraging electronic health records data and natural language processing enrichment methods</article-title>
          <source>Front Psychiatry</source>
          <year>2022</year>
          <month>04</month>
          <day>05</day>
          <volume>13</volume>
          <fpage>844442</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35479497"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fpsyt.2022.844442</pub-id>
          <pub-id pub-id-type="medline">35479497</pub-id>
          <pub-id pub-id-type="pmcid">PMC9037331</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davenport</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kalakota</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The potential for artificial intelligence in healthcare</article-title>
          <source>Future Healthc J</source>
          <year>2019</year>
          <month>06</month>
          <day>13</day>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>94</fpage>
          <lpage>98</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31363513"/>
          </comment>
          <pub-id pub-id-type="doi">10.7861/futurehosp.6-2-94</pub-id>
          <pub-id pub-id-type="medline">31363513</pub-id>
          <pub-id pub-id-type="pii">futurehealth</pub-id>
          <pub-id pub-id-type="pmcid">PMC6616181</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raven</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Doran</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Kostrowski</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gillespie</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Elbel</surname>
              <given-names>BD</given-names>
            </name>
          </person-group>
          <article-title>An intervention to improve care and reduce costs for high-risk patients with frequent hospital admissions: a pilot study</article-title>
          <source>BMC Health Serv Res</source>
          <year>2011</year>
          <month>10</month>
          <day>13</day>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>270</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmchealthservres.biomedcentral.com/articles/10.1186/1472-6963-11-270"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6963-11-270</pub-id>
          <pub-id pub-id-type="medline">21995329</pub-id>
          <pub-id pub-id-type="pii">1472-6963-11-270</pub-id>
          <pub-id pub-id-type="pmcid">PMC3212942</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Twick</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zahavi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Benvenisti</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Rubinstein</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Woods</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Berkenstadt</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Nissan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hosgor</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Assaf</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Towards interpretable, medically grounded, EMR-based risk prediction models</article-title>
          <source>Sci Rep</source>
          <year>2022</year>
          <month>06</month>
          <day>15</day>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>9990</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-022-13504-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-022-13504-7</pub-id>
          <pub-id pub-id-type="medline">35705550</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-022-13504-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC9200841</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
