<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v2i1e42884</article-id>
      <article-id pub-id-type="pmid">38875556</article-id>
      <article-id pub-id-type="doi">10.2196/42884</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Extraction of Radiological Characteristics From Free-Text Imaging Reports Using Natural Language Processing Among Patients With Ischemic and Hemorrhagic Stroke: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Malin</surname>
            <given-names>Bradley</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kapsetaki</surname>
            <given-names>Marianna</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Balogun</surname>
            <given-names>Oluwafemi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Klement</surname>
            <given-names>William</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Hsu</surname>
            <given-names>Enshuo</given-names>
          </name>
          <degrees>BSc, MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8137-0171</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Bako</surname>
            <given-names>Abdulaziz T</given-names>
          </name>
          <degrees>MBBS, MPH, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1584-8114</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Potter</surname>
            <given-names>Thomas</given-names>
          </name>
          <degrees>BSc, MSc, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7884-4172</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Pan</surname>
            <given-names>Alan P</given-names>
          </name>
          <degrees>BSc, MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8782-8024</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Britz</surname>
            <given-names>Gavin W</given-names>
          </name>
          <degrees>MBBCHIR, MPH, MBA</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2358-3005</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Tannous</surname>
            <given-names>Jonika</given-names>
          </name>
          <degrees>BA, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4022-9267</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Vahidy</surname>
            <given-names>Farhaan S</given-names>
          </name>
          <degrees>MBBS, MPH, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Center for Health Data Science and Analytics</institution>
            <institution>Houston Methodist Research Institute</institution>
            <addr-line>7550 Greenbriar Drive</addr-line>
            <addr-line>Houston, TX, 77030</addr-line>
            <country>United States</country>
            <phone>1 346 356 1479</phone>
            <email>fvahidy@houstonmethodist.org</email>
          </address>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3464-2111</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Center for Health Data Science and Analytics</institution>
        <institution>Houston Methodist Research Institute</institution>
        <addr-line>Houston, TX</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>School of Biomedical Informatics</institution>
        <institution>University of Texas Health Science Center at Houston</institution>
        <addr-line>Houston, TX</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Neurosurgery</institution>
        <institution>Houston Methodist Neurological Institute</institution>
        <addr-line>Houston, TX</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Neurology</institution>
        <institution>Weill Cornell Medical College</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Population Health Sciences</institution>
        <institution>Weill Cornell Medical College</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Farhaan S Vahidy <email>fvahidy@houstonmethodist.org</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>6</day>
        <month>6</month>
        <year>2023</year>
      </pub-date>
      <volume>2</volume>
      <elocation-id>e42884</elocation-id>
      <history>
        <date date-type="received">
          <day>22</day>
          <month>9</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>11</day>
          <month>11</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>10</day>
          <month>1</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>8</day>
          <month>4</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Enshuo Hsu, Abdulaziz T Bako, Thomas Potter, Alan P Pan, Gavin W Britz, Jonika Tannous, Farhaan S Vahidy. Originally published in JMIR AI (https://ai.jmir.org), 06.06.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2023/1/e42884" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Neuroimaging is the gold-standard diagnostic modality for all patients suspected of stroke. However, the unstructured nature of imaging reports remains a major challenge to extracting useful information from electronic health records systems. Despite the increasing adoption of natural language processing (NLP) for radiology reports, information extraction for many stroke imaging features has not been systematically evaluated.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this study, we propose an NLP pipeline, which adopts the state-of-the-art ClinicalBERT model with domain-specific pretraining and task-oriented fine-tuning to extract 13 stroke features from head computed tomography imaging notes.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used the model to generate structured data sets with information on the presence or absence of common stroke features for 24,924 patients with strokes. We compared the survival characteristics of patients with and without features of severe stroke (eg, midline shift, perihematomal edema, or mass effect) using the Kaplan-Meier curve and log-rank tests.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Pretrained on 82,073 head computed tomography notes with 13.7 million words and fine-tuned on 200 annotated notes, our HeadCT_BERT model achieved an average area under receiver operating characteristic curve of 0.9831, <italic>F</italic><sub>1</sub>-score of 0.8683, and accuracy of 97%. Among patients with acute ischemic stroke, admissions with any severe stroke feature in initial imaging notes were associated with a lower probability of survival (<italic>P</italic>&#60;.001).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our proposed NLP pipeline achieved high performance and has the potential to improve medical research and patient safety.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>deep learning</kwd>
        <kwd>electronic health records</kwd>
        <kwd>ischemic stroke</kwd>
        <kwd>cerebral hemorrhage</kwd>
        <kwd>neuroimaging</kwd>
        <kwd>computed tomography</kwd>
        <kwd>stroke</kwd>
        <kwd>radiology</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Overview</title>
        <p>Computed tomography (CT) and magnetic resonance imaging (MRI) are the gold standards for assessing and triaging patients with suspected strokes. However, free-text imaging reports containing important radiological findings are embedded in electronic health records (EHRs) systems in an unstructured narrative format, precluding data encoding [<xref ref-type="bibr" rid="ref1">1</xref>] to enable clinical decisions and support research applications [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. Fortunately, the limitations of unstructured data have been mitigated by recent advancements in information extraction and processing methods, such as natural language processing (NLP).</p>
        <p>Traditional rule-based NLP algorithms that use handcrafted dictionaries, keywords, and decision rules to analyze the structure of the language have classically been adopted for analyses of textual data [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. However, the creation and maintenance of decision rules are labor-intensive tasks, and the quality of rules significantly influences model performance. In recent years, data-driven methods, including machine learning and deep learning, have been developed. Machine learning approaches use derived features (eg, term frequency and n-gram) from text to train supervised-learning models (eg, support vector machine [SVM] or random forest) and predict desirable outputs on new documents [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Deep learning methods often involve more sophisticated architectures (eg, recurrent neural networks, convolutional neural networks, and self-attention) and use word embeddings to account for the sequence and context of natural language [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p>
        <p>The Bidirectional Encoder Representations from Transformers (BERT) NLP model, which uses a 24-layered deep learning architecture, was published in 2018 and achieved state-of-the-art performance on NLP benchmarks [<xref ref-type="bibr" rid="ref12">12</xref>]. A clinical version, ClinicalBERT, was later developed by pretraining the BERT model on EHR notes to achieve improved performance on clinical data [<xref ref-type="bibr" rid="ref13">13</xref>]. Furthermore, the ClinicalBERT model has also been trained and validated for the extraction of radiological features from chest and bone x-ray notes [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>In the context of cerebrovascular disease and stroke, NLP has been applied to classify various stroke phenotypes [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] and perform feature extraction [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Despite these emerging applications, optimal use of NLP pipelines for stroke research is yet to be achieved. More specifically, limited studies have used BERT to extract important neuroimaging findings, such as midline shift [<xref ref-type="bibr" rid="ref16">16</xref>] and mass effect [<xref ref-type="bibr" rid="ref17">17</xref>]. Therefore, the use of NLP-based extraction of many critically important neuroimaging features has not been systematically implemented. We evaluated a deep learning–based NLP model (HeadCT_BERT) that is built upon ClinicalBERT and fine-tuned for the extraction and structured data generation of 13 critical stroke neuroimaging features.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <sec>
          <title>NLP on Stroke Imaging Notes</title>
          <p>NLP has been adopted to automate stroke acuity classification. Li et al [<xref ref-type="bibr" rid="ref8">8</xref>] used head CT and MRI radiology reports to train a random forest model for ischemic stroke acuity classification. Kim et al [<xref ref-type="bibr" rid="ref9">9</xref>] evaluated logistic regression, naïve Bayesian, decision tree, and SVM models to identify ischemic stroke from MRI reports. In addition, Garg et al [<xref ref-type="bibr" rid="ref3">3</xref>] trained a variety of machine learning algorithms (ie, k-nearest neighbors, SVM, random forest, extra trees classifier, and XGBoost) to identify ischemic stroke subtypes from neurology progress notes and neuroradiology reports. In addition to NLP-based classification algorithms, a few studies adopted NLP for stroke imaging feature extraction. Yu et al [<xref ref-type="bibr" rid="ref5">5</xref>] used a rule-based NLP tool, CHARTextract, to extract the type of occlusion, presence of established ischemia, and hemorrhage from CT reports. Gordon et al [<xref ref-type="bibr" rid="ref17">17</xref>] proposed a machine learning–based method using XGBoost to extract the intracranial mass effect. However, there are several untapped avenues for the applications of state-of-the-art NLP methods in the stroke and cerebrovascular disease domain.</p>
        </sec>
        <sec>
          <title>Fine-Tuning BERT for Medical Imaging Findings Extraction</title>
          <p>The most common application of BERT is to fine-tune the out-of-box network for the NLP task. Olthof et al [<xref ref-type="bibr" rid="ref18">18</xref>] fine-tuned the BERT model with 3268 labeled radiology reports of injured extremities and chest radiographs for extracting the presence of injury. The BERT network was appended with a binary classifier layer and trained (“fine-tuned”) with the labeled reports. The authors reported that BERT outperformed rule-based classifiers and machine learning classifiers and achieved an <italic>F</italic><sub>1</sub>-score of 0.95 and an area under receiver operating characteristic curve (AUROC) of 0.99. Fink et al [<xref ref-type="bibr" rid="ref19">19</xref>] fine-tuned the German-language BERT with structured oncology reports for rapid tumor response category classification. The results showed that the BERT model (<italic>F</italic><sub>1</sub>=0.70) achieved a similar performance as that of medical students (<italic>F</italic><sub>1</sub>≈0.73), although it was inferior to radiologists’ performance (<italic>F</italic><sub>1</sub>=0.79).</p>
        </sec>
        <sec>
          <title>Pretraining and Fine-Tuning BERT for Medical Imaging Findings Extraction</title>
          <p>Pretraining BERT with domain-specific text is an additional step that may boost model performance in subsequent fine-tuning. Smit et al [<xref ref-type="bibr" rid="ref14">14</xref>] used an automatic labeling algorithm to tag 200,000 radiology reports for pretraining. After pretraining, 1000 reports were randomly sampled and annotated by radiologists for fine-tuning. The final NLP model, CheXbert, achieved state-of-the-art performance on one of the largest chest x-ray data sets, MIMIC-CXR, with an <italic>F</italic><sub>1</sub>-score of 0.798, which is close to radiologists’ performances (<italic>F</italic><sub>1</sub>=0.805). Dai et al [<xref ref-type="bibr" rid="ref15">15</xref>] took a similar approach using x-ray radiology reports for bone fracture. The authors developed a rule-based automatic labeling algorithm to label 6048 reports for model pretraining. Subsequently, the model was fine-tuned with a subset of 4890 manually annotated reports for fracture status detection (ie, positive, negative, or uncertain) and fracture type, bone type, and location extraction. To our knowledge, BERT pretraining in the biomedical field is underused and has not been attempted within the cerebrovascular disease domain.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Source and Variables</title>
        <p>Registry for Neurological Endpoint Assessments among Patients with Ischemic and Hemorrhagic Stroke (REINAH) [<xref ref-type="bibr" rid="ref20">20</xref>] is a data warehouse built upon the EHR at Houston Methodist, a tertiary health care system serving the greater Houston metropolitan area. REINAH hosts data for over 45,000 patients with cerebrovascular disease, representing over 982,000 neuroimaging records obtained between September 2007 and August 2022. From REINAH, we queried records that (1) had final results available before data collection on July 19, 2021; (2) had an imaging type of “CT head without contrast”; and (3) had attached imaging notes. All imaging notes were written in short paragraphs and stored as plain text. The age, sex, race, ethnicity, BMI, insurance type, stroke type, and National Institutes of Health Stroke Scale scores were extracted from each patient’s initial stroke encounter.</p>
      </sec>
      <sec>
        <title>Ethics Approval</title>
        <p>This study was approved by the Houston Methodist Institutional Review Board (PRO00025034).</p>
      </sec>
      <sec>
        <title>Annotation</title>
        <p>We identified 20 clinically relevant stroke-related features to extract, including hemorrhage volume, midline shift, herniation, perihematomal edema, white matter hyperintensity, intracerebral hemorrhage (ICH) location, lacunes, old stroke, remote stroke, subacute infarct, cerebral atrophy, intraventricular hemorrhage, acute ischemia, subdural hematoma, subarachnoid hemorrhage, extra-axial hemorrhage, encephalomalacia, mass effect, and location for any non-ICH lesion (finding location). Each imaging note could include none, one, or multiple concepts. As illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>, we randomly sampled 400 notes for model fine-tuning and evaluation and adopted the Begin-Inside-Outside method [<xref ref-type="bibr" rid="ref21">21</xref>], which tags the starting position and end position of predetermined imaging features of interest in the text. We then randomly partitioned the 400 samples into the following three data sets: (1) a communication set containing 50 notes; (2) a reviewer-agreement set with 50 notes; and (3) two independent-review sets, each containing 150 notes. Two clinically trained reviewers in neuroimaging (ATB and TP) then manually annotated the imaging notes in 3 sequential stages. In the first stage, the communication set was annotated collaboratively by the 2 reviewers. In the second stage, reviewers performed separate annotations of the reviewer-agreement set, and Kappa statistics and percent agreement were evaluated. Inconsistent annotations were discussed to reach a consensus. Finally, independent review sets were separately annotated. Stroke imaging features that were identified in fewer than 20 notes were excluded from modeling.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Methodology flowchart. We used unannotated computed tomography (CT) imaging notes to pretrain the natural language processing (NLP) model and used a subset of annotated imaging notes to fine-tune and evaluate it. BERT: bidirectional encoder representations from transformers; REINAH: Registry for Neurological Endpoint Assessments among Patients with Ischemic and Hemorrhagic Stroke.</p>
          </caption>
          <graphic xlink:href="ai_v2i1e42884_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Text Processing</title>
        <p>Before a sequence of human language can be processed by NLP models, the text often goes through processes of segmentation, tokenization, and word embedding [<xref ref-type="bibr" rid="ref22">22</xref>]. To segment notes, we first fixed a segment length of 32 words and a step size of 10 words. For each note, the first 32 words were taken as a segment, which was then shifted to the right by 1 step (10 words) to isolate the next segment of 32 words. This process was repeated until the end of the note was reached, thereby transforming a single long note into multiple short, overlapping, text segments. For each segment, word tokenization, which transforms sentences and phrases into individual word-tokens, was performed using the WordPiece [<xref ref-type="bibr" rid="ref23">23</xref>] algorithm implemented in the Python Transformers module (version 4.10.0) and based on a predefined dictionary. In-dictionary words with predetermined tokens (eg, “stroke” and “patient”) were mapped to respective numeric IDs (word embedding). Conversely, out-of-dictionary words (eg, “edema” and “hemorrhage”) were split into multiple in-dictionary tokens and mapped to multiple token IDs (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Examples of text segmentation and word embedding<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="240"/>
            <col width="370"/>
            <col width="390"/>
            <thead>
              <tr valign="top">
                <td>Input word</td>
                <td>Word-token(s)</td>
                <td>Word embedding ID(s)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>stroke</td>
                <td>stroke</td>
                <td>6625</td>
              </tr>
              <tr valign="top">
                <td>patient</td>
                <td>patient</td>
                <td>5351</td>
              </tr>
              <tr valign="top">
                <td>edema</td>
                <td>(ed, ##ema)</td>
                <td>(5048, 14494)</td>
              </tr>
              <tr valign="top">
                <td>hemorrhage</td>
                <td>(hem, ##or, ##r, ##hage)</td>
                <td>(23123, 1766, 1197, 19911)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>The WordPiece algorithm takes each word as input. If a word matches a predefined word-token, embedding is done by assigning a token ID to the word. If a word does not match any predefined token, the word is split into multiple fractions and matched with predefined tokens.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Deep Learning NLP Models</title>
        <p>Our NLP model training involved two phases, as follows: (1) an optional general training phase (“pretraining”) that familiarized the model with clinical terminology in head CT notes, and (2) a required task-specific training phase (“fine-tuning”), where the model learned to identify the 13 remaining stroke features (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <sec>
          <title>Pretraining</title>
          <p>Though NLP models can be trained with solely fine-tuning, recent studies have reported an improved performance after general [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref24">24</xref>] and domain-specific [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref25">25</xref>] pretraining. We used the ClinicalBERT model, which has been pretrained on general English corpora and EHR narratives [<xref ref-type="bibr" rid="ref13">13</xref>]. We hypothesized that further pretraining it with our head CT notes using masked language model (MLM) [<xref ref-type="bibr" rid="ref12">12</xref>] would boost the performance for stroke feature extraction. Details of NLP model pretraining are provided in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. MLM used a “self-supervised” algorithm that generated labels without human annotation. A note was first tokenized into a sequence of word-tokens, and 15% of the tokens were randomly selected. For each selected token, there was an 80% probability it would be masked (replaced by a “[MASK]” token), a 10% probability it would be replaced by a random token, and a 10% probability it would remain unchanged. The MLM pretraining trained the NLP model to do “cloze,” that is, input a sequence of word-tokens with masked tokens and predict the masked tokens using the context. It is hypothesized that through learning the cloze task, the NLP model can generalize this knowledge to improve the performance of other NLP tasks. We continuously pretrained the ClinicalBERT model with 74.0k head CT imaging notes from 2007 to 2020, including a total of 13.7 million words for 5 rounds (“epochs”), and used stand-alone 8.2k notes from January to July 2021 for MLM evaluation (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). This pretraining process produced a BERT model, which we labeled “HeadCT_BERT,” that is specific to the head CT imaging domain and can be further fine-tuned for downstream NLP tasks.</p>
        </sec>
        <sec>
          <title>Fine-Tuning</title>
          <p>To train the HeadCT_BERT for stroke features extraction, our downstream task in this study, we fine-tuned it with a development set of 200 notes annotated with stroke features. The HeadCT_BERT was appended with a feedforward layer with sigmoid activation function (“classification layer”) for the stroke feature classification. For each input segment (coded as a sequence of word-tokens with a maximum length of 64), the network outputs an array of probabilities (one probability for each stroke feature). The entire network (HeadCT_BERT + classification layer) was trained simultaneously. To prevent the model from becoming too attuned to the details of the development set, and consequently losing flexibility for new data (ie, to avoid overfitting), the development set was divided into a training set (80% of the notes) and a validation set (the remaining 20% of notes) [<xref ref-type="bibr" rid="ref26">26</xref>]. Model weights were saved as checkpoints after each epoch, and optimal checkpoint weights were selected during validation as our final NLP model. The same fine-tuning process was also performed on the out-of-box ClinicalBERT model for comparison. The deep learning model was implemented using Python 3.9.6, PyTorch 1.9.0, and Transformers 4.10.0. Model computations were performed on an NVIDIA RTX 5000 graphics processing unit.</p>
        </sec>
      </sec>
      <sec>
        <title>Prediction and Evaluation</title>
        <p>The NLP model predicts the probabilities of stroke features in each segment. We aggregated the predictions to the note level by selecting the maximum probability of each stroke feature among segments. The final prediction for each note consists of a probability per stroke feature (multilabel classification). We considered stroke features with a probability &#62;.5 as present.</p>
        <p>To evaluate our NLP model performance, we used a stand-alone evaluation set of 200 annotated imaging notes. Evaluation metrics included recall (sensitivity), specificity, precision (positive predictive value), <italic>F</italic><sub>1</sub>-score (the harmonic mean of precision and recall), AUROC, and accuracy. The <italic>F</italic><sub>1</sub>-score ranges from 0 to 1, with 1 implying perfect model performance. We also calculated predicted probabilities and fraction of stroke features and presented probability calibration curves (reliability diagrams).</p>
        <p>
          <disp-formula>
            <graphic xlink:href="ai_v2i1e42884_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </p>
      </sec>
      <sec>
        <title>Sensitivity Analysis</title>
        <p>One challenge for NLP modeling is the need for a large amount of human annotation, which is time consuming and labor intensive. To explore the relationship between the number of annotated training notes and model performance, and potentially reduce the annotation workload, we performed a sensitivity analysis that compared NLP models that were fine-tuned with different development set sizes: 25, 50, 100, and 150 notes. Each subset was split into a training set (80%) and a validation set (20%) and was evaluated on the set of 200 notes.</p>
      </sec>
      <sec>
        <title>Structured Data Generation</title>
        <p>Upon achieving satisfactory evaluation, we ran the model on all head CT imaging notes to automatically generate a structured data set of stroke imaging features. Each feature was represented as a binary variable (yes/no) associated with an imaging note. We further performed survival analysis with the Kaplan-Meier curves to evaluate the association between having any of the severe stroke features (eg, midline shift, perihematomal edema, and mass effect), as captured by NLP, and mortality for patients with acute ischemic stroke (AIS) and ICH. Differences in survival curves were compared using log-rank tests. We calculated survival rates and median survival days.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Of the 982,536 available images in REINAH, we identified 82,073 head CT imaging notes representing 24,924 unique patients, of whom, 13,439 (53.9%) were female, 14,028 (56.3%) were non-Hispanic White, and 15,121 (60.7%) were Medicare beneficiaries, with an overall median age of 69 (IQR 58.5-78.3) years. With regard to stroke subtypes (at the initial encounter), 12,623 (54.4%) of patients had AIS diagnosis, 1307 (5.6%) had subarachnoid hemorrhage (SAH), 7084 (30.5%) had a transient ischemic attack (TIA), and 2208 (9.5%) had ICH. For patients with AIS, the median National Institutes of Health Stroke Scale within 6 and 12 hours of admission was 3.0 (IQR 1.0-7.0), whereas it was 7.0 (IQR 2.0-19.0) for patients with ICH. The 400 randomly sampled notes represented 398 unique patients. Their sociodemographic characteristics were consistent with the overall population of patients with head CT images. However, a greater proportion of sampled (vs full cohort) patients had a subarachnoid hemorrhage or an ICH, perhaps owing to head CT being a gold standard for evaluation of ICH. Although median BMI was not significantly different in the annotation sample (vs full cohort), the full cohort had a significantly higher proportion of missing BMI information (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
      <p>After annotation, stroke imaging features, including hemorrhage volume, herniation, ICH location, location of other relevant findings, remote stroke, subdural hematoma, and extra-axial hemorrhage, were excluded from modeling due to low frequencies (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The interreviewer agreement analysis showed an excellent agreement between the 2 annotators (average Kappa of 0.85 and 97.1% agreement).</p>
      <p>Our fine-tuned HeadCT_BERT model had an AUROC of 0.9831 and an <italic>F</italic><sub>1</sub>-score of 0.8683. The <italic>F</italic><sub>1</sub>-scores were greater than 0.9 for 8 of 13 (61.5%) stroke imaging features, and the AUROCs were greater than 0.96 for all features except for acute ischemia. Results show that after fine-tuning, both ClinicalBERT and HeadCT_BERT achieved favorable performances, while HeadCT_BERT demonstrated marginally better performance (<xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="table" rid="table4">Table 4</xref>; Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
      <p>The sensitivity analysis revealed sigmoid shapes for both models, indicating that improvement in model performance wanes as sample size approaches an optimal point. Specifically, we found marked performance improvements when increasing the training sample size from 25 to 50 and 100 notes. From 100 to 150 notes, however, the performance gain decreased, and from 150 to 200 notes, the performance gain was minimal, indicating that the NLP models had achieved near-optimal performance (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
      <p>The probability calibration curves showed HeadCT_BERT is well calibrated for some stroke features (eg, midline shift, white matter hyperintensity, subacute infarct, acute ischemia, subarachnoid hemorrhage, and encephalomalacia), while ClinicalBERT is well calibrated for midline shift, white matter hyperintensity, old stroke, subacute infarct, cerebral atrophy, acute ischemia, ICH, encephalomalacia, and mass effect (Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
      <p>Running on a single–graphics processing unit server, our final NLP model processed ~230 imaging notes per minute and automatically generated a structured stroke imaging feature data set from 24,924 patients with head CT notes across the hospital system. In the resulting data set, 3826 (15.4%) of patients had a mass effect, 3600 (14.4%) had perihematomal edema, 1908 (7.7%) had a midline shift, and 5146 (20.6%) had 1 or more severe stroke features (eg, midline shift, mass effect, or perihematomal edema; <xref ref-type="table" rid="table5">Table 5</xref>).</p>
      <p>Survival analysis based on the initial head CT notes of 6463 AIS and 1243 ICH emergency admissions showed that patients with severe stroke features had higher mortality and shorter survival times (AIS: 18.4% mortality rate and 585 days median survival time; ICH: 20.7% mortality rate and 572 days median survival time) compared to other patients (AIS: 10.1% mortality rate and 759 days median survival time; ICH: 17.8% mortality rate and 638 days median survival time). Differences in survival probability over time are shown as Kaplan-Meier curves. Among AIS admissions, patients with severe stroke features had significantly lower survival probabilities (<italic>P</italic>&#60;.001; <xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Patient characteristics (average age and BMI are reported at imaging encounters). Italicized <italic>P</italic> values are significant.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="30"/>
          <col width="310"/>
          <col width="0"/>
          <col width="250"/>
          <col width="0"/>
          <col width="240"/>
          <col width="0"/>
          <col width="0"/>
          <col width="140"/>
          <thead>
            <tr valign="bottom">
              <td colspan="4">Characteristics</td>
              <td colspan="2">Head CT<sup>a</sup> population</td>
              <td colspan="2">Annotation sample</td>
              <td colspan="2"><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="4">Imaging notes, N</td>
              <td colspan="2">82,073</td>
              <td colspan="2">400</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="4">Unique patients, N</td>
              <td colspan="2">24,924</td>
              <td colspan="2">398</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="4">Age (years), median (Q1, Q3)</td>
              <td colspan="2">69.0 (58.5, 78.3)</td>
              <td colspan="2">68.0 (56.4, 78.1)</td>
              <td colspan="2">.22</td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>Age (years), n (%)</bold>
              </td>
              <td>.41</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">0-49</td>
              <td colspan="2">3025 (12.1)</td>
              <td colspan="2">57 (14.3)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">50-59</td>
              <td colspan="2">3793 (15.2)</td>
              <td colspan="2">61 (15.3)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">60-69</td>
              <td colspan="2">6149 (24.7)</td>
              <td colspan="2">103 (25.9)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">≥70</td>
              <td colspan="2">11,957 (48)</td>
              <td colspan="2">177 (44.5)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>Gender, n (%)</bold>
              </td>
              <td>.69</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Female</td>
              <td colspan="2">13,439 (53.9)</td>
              <td colspan="2">219 (55)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Male</td>
              <td colspan="2">11,485 (46.1)</td>
              <td colspan="2">179 (45)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>Race or ethnicity, n (%)</bold>
              </td>
              <td>.22</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Non-Hispanic White</td>
              <td colspan="2">14,028 (56.3)</td>
              <td colspan="2">206 (51.8)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Black</td>
              <td colspan="2">5690 (22.8)</td>
              <td colspan="2">102 (25.6)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Hispanic</td>
              <td colspan="2">3412 (13.7)</td>
              <td colspan="2">61 (15.3)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Asian</td>
              <td colspan="2">1209 (4.9)</td>
              <td colspan="2">16 (4)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Other or unknown</td>
              <td colspan="2">585 (2.3)</td>
              <td colspan="2">13 (3.3)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="4">BMI (kg/m<sup>2</sup>), median (Q1, Q3)</td>
              <td colspan="2">27.3 (23.7, 31.7)</td>
              <td colspan="2">27.3 (23.5, 31.0)</td>
              <td colspan="2">.59</td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>BMI (kg/m<sup>2</sup>), n (%)</bold>
              </td>
              <td>
                <italic>.001</italic>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Underweight</td>
              <td colspan="2">637 (2.6)</td>
              <td colspan="2">13 (3.3)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Normal</td>
              <td colspan="2">6193 (24.8)</td>
              <td colspan="2">108 (27.1)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Overweight</td>
              <td colspan="2">6518 (26.2)</td>
              <td colspan="2">123 (30.9)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Obese</td>
              <td colspan="2">6610 (26.5)</td>
              <td colspan="2">107 (26.9)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Missing</td>
              <td colspan="2">4966 (19.9)</td>
              <td colspan="2">47 (11.8)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>Insurance<sup>b</sup>, n (%)</bold>
              </td>
              <td>
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="8">
                <bold>Medicare</bold>
              </td>
              <td>.15</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>No</td>
              <td colspan="2">9803 (39.3)</td>
              <td colspan="2">142 (35.7)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Yes</td>
              <td colspan="2">15,121 (60.7)</td>
              <td colspan="2">256 (64.3)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="8">
                <bold>Medicaid</bold>
              </td>
              <td>.12</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>No</td>
              <td colspan="2">23,793 (95.5)</td>
              <td colspan="2">373 (93.7)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Yes</td>
              <td colspan="2">1131 (4.5)</td>
              <td colspan="2">25 (6.3)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="8">
                <bold>Commercial</bold>
              </td>
              <td>
                <italic>.04</italic>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>No</td>
              <td colspan="2">20,194 (81)</td>
              <td colspan="2">306 (76.9)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Yes</td>
              <td colspan="2">4730 (19)</td>
              <td colspan="2">92 (23.1)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="8">
                <bold>Exchange</bold>
              </td>
              <td>.79</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>No</td>
              <td colspan="2">24,437 (98)</td>
              <td colspan="2">389 (97.7)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Yes</td>
              <td colspan="2">487 (2)</td>
              <td colspan="2">9 (2.3)</td>
              <td colspan="3">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>Primary stroke type<sup>c</sup>, n (%)</bold>
              </td>
              <td>
                <italic>&#60;.001</italic>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Subarachnoid hemorrhage</td>
              <td colspan="2">1307 (5.6)</td>
              <td colspan="2">29 (7.7)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Transient ischemic attack</td>
              <td colspan="2">7084 (30.5)</td>
              <td colspan="2">100 (26.5)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Intracerebral hemorrhage</td>
              <td colspan="2">2208 (9.5)</td>
              <td colspan="2">59 (15.6)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Acute ischemic stroke</td>
              <td colspan="2">12,623 (54.4)</td>
              <td colspan="2">189 (50.1)</td>
              <td colspan="2">
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>NIHSS<sup>d</sup> Stroke Scale for acute ischemic stroke, median (Q1, Q3)</bold>
              </td>
              <td>
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Average NIHSS in 6 hours</td>
              <td colspan="2">3.0 (1.0, 7.0)</td>
              <td colspan="2">3.0 (1.5, 9.0)</td>
              <td colspan="2">.09</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Average NIHSS in 12 hours</td>
              <td colspan="2">3.0 (1.0, 7.0)</td>
              <td colspan="2">3.0 (1.0, 8.0)</td>
              <td colspan="2">.24</td>
            </tr>
            <tr valign="top">
              <td colspan="9">
                <bold>NIHSS Stroke Scale for intracerebral hemorrhage, median (Q1, Q3)</bold>
              </td>
              <td>
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Average NIHSS in 6 hours</td>
              <td colspan="2">7.0 (2.0, 19.0)</td>
              <td colspan="2">6.0 (1.5, 18.0)</td>
              <td colspan="2">.94</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Average NIHSS in 12 hours</td>
              <td colspan="2">7.0 (2.0, 19.0)</td>
              <td colspan="2">7.0 (2.0, 18.0)</td>
              <td colspan="2">.81</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>CT: computed tomography.</p>
          </fn>
          <fn id="table2fn2">
            <p><sup>b</sup>Insurance type was collected throughout all imaging encounters.</p>
          </fn>
          <fn id="table2fn3">
            <p><sup>c</sup>For patients with multiple stroke visits, the initial encounter’s stroke scale and primary stroke type are presented. We performed hypothesis testing to compare the 398 sampled patients with the nonsampled population. Chi-square tests were adopted for categorical variables, and Kruskal-Wallis tests were adopted for continuous variables.</p>
          </fn>
          <fn id="table2fn4">
            <p><sup>d</sup>NIHSS: National Institutes of Health Stroke Scale.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Final natural language processing model evaluation with the evaluation set (N=200) at the imaging note level.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="220"/>
          <col width="90"/>
          <col width="90"/>
          <col width="90"/>
          <col width="90"/>
          <col width="210"/>
          <col width="210"/>
          <thead>
            <tr valign="bottom">
              <td>Stroke feature</td>
              <td>Specificity</td>
              <td>Precision</td>
              <td>Recall</td>
              <td><italic>F</italic><sub>1</sub>-score</td>
              <td>AUROC<sup>a</sup> (95% CI)</td>
              <td>Accuracy (95% CI)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Midline shift</td>
              <td>1</td>
              <td>1</td>
              <td>0.9375</td>
              <td>0.9677</td>
              <td>0.9973 (0.9792-1.0154)</td>
              <td>0.9950 (0.9852-1.0048)</td>
            </tr>
            <tr valign="top">
              <td>Perihematomal edema</td>
              <td>0.9945</td>
              <td>0.9474</td>
              <td>0.9474</td>
              <td>0.9474</td>
              <td>0.9994 (0.9917-1.0071)</td>
              <td>0.9900 (0.9762-1.0038)</td>
            </tr>
            <tr valign="top">
              <td>White matter hyperintensity</td>
              <td>0.9725</td>
              <td>0.9667</td>
              <td>0.956</td>
              <td>0.9613</td>
              <td>0.9704 (0.9452-0.9955)</td>
              <td>0.9650 (0.9395-0.9905)</td>
            </tr>
            <tr valign="top">
              <td>Lacunes</td>
              <td>1</td>
              <td>1</td>
              <td>1</td>
              <td>1</td>
              <td>1.0000 (1.0000-1.0000)</td>
              <td>1.0000 (1.0000-1.0000)</td>
            </tr>
            <tr valign="top">
              <td>Old stroke</td>
              <td>0.9581</td>
              <td>0.8056</td>
              <td>0.8788</td>
              <td>0.8406</td>
              <td>0.9693 (0.9277-1.0110)</td>
              <td>0.9450 (0.9134-0.9766)</td>
            </tr>
            <tr valign="top">
              <td>Subacute infarct</td>
              <td>0.9945</td>
              <td>0.9091</td>
              <td>0.5556</td>
              <td>0.6897</td>
              <td>0.9789 (0.9321-1.0258)</td>
              <td>0.9550 (0.9263-0.9837)</td>
            </tr>
            <tr valign="top">
              <td>Cerebral atrophy</td>
              <td>0.9173</td>
              <td>0.8571</td>
              <td>0.9851</td>
              <td>0.9167</td>
              <td>0.9673 (0.9369-0.9978)</td>
              <td>0.9400 (0.9071-0.9729)</td>
            </tr>
            <tr valign="top">
              <td>Intraventricular hemorrhage</td>
              <td>0.984</td>
              <td>0.7273</td>
              <td>0.6154</td>
              <td>0.6667</td>
              <td>0.9798 (0.9259-1.0338)</td>
              <td>0.9600 (0.9328-0.9872)</td>
            </tr>
            <tr valign="top">
              <td>Acute ischemia</td>
              <td>0.956</td>
              <td>0.6364</td>
              <td>0.7778</td>
              <td>0.7</td>
              <td>0.9362 (0.8570-1.0154)</td>
              <td>0.9400 (0.9071-0.9729)</td>
            </tr>
            <tr valign="top">
              <td>Intracerebral hemorrhage</td>
              <td>0.9665</td>
              <td>0.75</td>
              <td>0.8571</td>
              <td>0.8</td>
              <td>0.9872 (0.9532-1.0212)</td>
              <td>0.9550 (0.9263-0.9837)</td>
            </tr>
            <tr valign="top">
              <td>Subarachnoid hemorrhage</td>
              <td>1</td>
              <td>1</td>
              <td>0.8333</td>
              <td>0.9091</td>
              <td>1.0000 (1.0000-1.0000)</td>
              <td>0.9900 (0.9762-1.0038)</td>
            </tr>
            <tr valign="top">
              <td>Encephalomalacia</td>
              <td>1</td>
              <td>1</td>
              <td>0.9524</td>
              <td>0.9756</td>
              <td>0.9989 (0.9890-1.0088)</td>
              <td>0.9950 (0.9852-1.0048)</td>
            </tr>
            <tr valign="top">
              <td>Mass effect</td>
              <td>0.9777</td>
              <td>0.84</td>
              <td>1</td>
              <td>0.913</td>
              <td>0.9952 (0.9743-1.0161)</td>
              <td>0.9800 (0.9606-0.9994)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table3fn1">
            <p><sup>a</sup>AUROC: area under receiver operating characteristic curve.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table4">
        <label>Table 4</label>
        <caption>
          <p>Average natural language processing model evaluation metrics among 13 stroke features for the fine-tuned models.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <thead>
            <tr valign="bottom">
              <td>Model</td>
              <td><italic>F</italic><sub>1</sub>-score, mean (SD)</td>
              <td>AUROC<sup>a</sup>, mean (SD)</td>
              <td>Accuracy, mean (SD)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>HeadCT_BERT (final model)</td>
              <td>
                <italic>0.8683 (0.1176)<sup>b</sup></italic>
              </td>
              <td>
                <italic>0.9831 (0.0189)<sup>b</sup></italic>
              </td>
              <td>
                <italic>0.9700 (0.0225)<sup>b</sup></italic>
              </td>
            </tr>
            <tr valign="top">
              <td>ClinicalBERT (baseline model)</td>
              <td>0.8564 (0.1173)</td>
              <td>0.9786 (0.0216)</td>
              <td>0.9665 (0.0237)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table4fn1">
            <p><sup>a</sup>AUROC: area under receiver operating characteristic curve.</p>
          </fn>
          <fn id="table4fn2">
            <p><sup>b</sup>Italicized values denote performance of the proposed model.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table5">
        <label>Table 5</label>
        <caption>
          <p>Natural language processing (NLP) model generating structured stroke feature data sets from imaging notes<sup>a</sup>.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <thead>
            <tr valign="top">
              <td>Characteristics</td>
              <td>Head CT<sup>b</sup> imaging patients<sup>c</sup> (N=24,924), n (%)</td>
              <td>Acute ischemic stroke admission initial CT<sup>d</sup> (N=6463), n (%)</td>
              <td>Intracerebral hemorrhage admission initial CT<sup>e</sup> (N=1243), n (%)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>White matter hyperintensity</td>
              <td>16,014 (64.3)</td>
              <td>3429 (53.1)</td>
              <td>407 (32.7)</td>
            </tr>
            <tr valign="top">
              <td>Cerebral atrophy</td>
              <td>13,615 (54.6)</td>
              <td>2262 (35)</td>
              <td>268 (21.6)</td>
            </tr>
            <tr valign="top">
              <td>Old stroke</td>
              <td>7426 (29.8)</td>
              <td>1324 (20.5)</td>
              <td>91 (7.3)</td>
            </tr>
            <tr valign="top">
              <td>Lacunes</td>
              <td>6622 (26.6)</td>
              <td>1386 (21.4)</td>
              <td>116 (9.3)</td>
            </tr>
            <tr valign="top">
              <td>Mass effect</td>
              <td>3826 (15.4)</td>
              <td>614 (9.5)</td>
              <td>500 (40.2)</td>
            </tr>
            <tr valign="top">
              <td>Intracerebral hemorrhage</td>
              <td>3822 (15.3)</td>
              <td>354 (5.5)</td>
              <td>1096 (88.2)</td>
            </tr>
            <tr valign="top">
              <td>Perihematomal edema</td>
              <td>3600 (14.4)</td>
              <td>436 (6.7)</td>
              <td>623 (50.1)</td>
            </tr>
            <tr valign="top">
              <td>Encephalomalacia</td>
              <td>3453 (13.9)</td>
              <td>373 (5.8)</td>
              <td>50 (4)</td>
            </tr>
            <tr valign="top">
              <td>Acute ischemia</td>
              <td>3426 (13.7)</td>
              <td>1173 (18.1)</td>
              <td>33 (2.7)</td>
            </tr>
            <tr valign="top">
              <td>Subacute infarct</td>
              <td>2675 (10.7)</td>
              <td>841 (13)</td>
              <td>28 (2.3)</td>
            </tr>
            <tr valign="top">
              <td>Subarachnoid hemorrhage</td>
              <td>2179 (8.7)</td>
              <td>132 (2)</td>
              <td>245 (19.7)</td>
            </tr>
            <tr valign="top">
              <td>Midline shift</td>
              <td>1908 (7.7)</td>
              <td>184 (2.8)</td>
              <td>345 (27.8)</td>
            </tr>
            <tr valign="top">
              <td>Intraventricular hemorrhage</td>
              <td>1409 (5.7)</td>
              <td>37 (0.6)</td>
              <td>405 (32.6)</td>
            </tr>
            <tr valign="top">
              <td>Severe stroke features<sup>f</sup></td>
              <td>5146 (20.6)</td>
              <td>901 (13.9)</td>
              <td>845 (68)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table5fn1">
            <p><sup>a</sup>Our final NLP model processed 82,073 head computed tomography notes for 24,924 unique patients in the entire hospital system and generated structured data sets.</p>
          </fn>
          <fn id="table5fn2">
            <p><sup>b</sup>CT: computed tomography.</p>
          </fn>
          <fn id="table5fn3">
            <p><sup>c</sup>The stroke features in the overall population were aggregated at the patient level.</p>
          </fn>
          <fn id="table5fn4">
            <p><sup>d,e</sup>The stroke features reported in the initial head CT of acute ischemic stroke and intracerebral hemorrhage emergency admissions are presented.</p>
          </fn>
          <fn id="table5fn5">
            <p><sup>f</sup>Severe stroke features include midline shift, perihematomal edema, or mass effect. Severe stroke feature is a composite feature.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>Kaplan-Meier curve of survival probability from initial admissions. Patients whose initial imaging includes severe stroke features (eg, midline shift, mass effect, or perihematomal edema) had a lower survival probability. (A) Acute ischemic stroke admissions (<italic>P</italic>&#60;.001). (B) Intracerebral hemorrhage admissions (<italic>P</italic>=.19).</p>
        </caption>
        <graphic xlink:href="ai_v2i1e42884_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>We propose an NLP pipeline to extract ischemic and hemorrhagic stroke characteristics from head CT imaging notes (HeadCT_BERT model). Built upon one of the latest clinical NLP models, the HeadCT_BERT model achieved an excellent average AUROC of 0.9831 and an accuracy of 97%. Our NLP pipeline showed promising performance for the detection of midline shift, perihematomal edema, lacunes, subarachnoid hemorrhage, encephalomalacia, and mass effect, with AUROCs for each of these features exceeding 0.99 and <italic>F</italic><sub>1</sub>-scores above 0.9 for the evaluation set. Other features, including white matter hyperintensity, old stroke, subacute infarct, cerebral atrophy, intraventricular hemorrhage, and ICH, showed AUROCs between 0.96 and 0.98. Other NLP studies have achieved optimal AUROC values of 0.9625 for mass effect extraction [<xref ref-type="bibr" rid="ref17">17</xref>], 0.96 for stroke presence, and 0.93 for stroke acuity [<xref ref-type="bibr" rid="ref1">1</xref>]. Our method achieved comparable or better performance for extracting stroke imaging features.</p>
        <p>In 2018 alone, 11.5 million head CT scans were performed in the United States [<xref ref-type="bibr" rid="ref27">27</xref>], generating valuable information that can be used to answer a multitude of stroke-related research questions. In the absence of methods to extract information in unstructured formats, the generation of insights from such sources is limited. This underscores the value of our NLP pipeline, which provides a fast, scalable, and automatic solution for the processing of unstructured text data.</p>
        <p>Application of our pipeline in a health care environment has the potential to benefit both medical research and patient safety. For example, in this study, we demonstrated the use of NLP for retrospectively identifying cohorts of patients with AIS and ICH with severe stroke features. We identified 901 (13.9%) patients with AIS and 845 (68%) patients with ICH who had severe stroke neuroimaging features and demonstrated lower survival rates for patients with these severe features, consistent with previous studies [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Beyond outcome prediction, modifications of our pipeline may also be implemented to improve patient safety. For example, NLP pipelines that detect incidents can be used to improve patient outreach workflows by optimizing reporting procedures for health care providers as well as the patients and their families [<xref ref-type="bibr" rid="ref30">30</xref>]. Our pipeline has the potential to process imaging notes in real time, generate flags for severe stroke findings, and trigger reminders and alerts within the EHR system.</p>
        <p>Despite the performance of our NLP pipeline, this study has limitations. First, it was conducted and evaluated in a single organization, where many of the notes may have been written by a relatively small number of radiologists or neuroradiologists. Therefore, the generalizability of the trained NLP models could be limited by overly consistent wording and grammar in training data. However, because our organization is one of the largest hospital systems, comprising 7 certified stroke care hospitals in the Houston metropolitan area, we feel that our inclusion of a diverse collection of notes yields enough variability in the training data to mitigate this issue. Second, although our HeadCT_BERT model demonstrated slightly improved performance for stroke feature extraction, it is hard to compare our model with ClinicalBERT due to the lack of well-established NLP benchmarks for head imaging reports. Future efforts to create head imaging NLP benchmarks are needed for comprehensive evaluation. Finally, the probability calibration curves of both HeadCT_BERT and ClinicalBERT for individual stroke features demonstrate a mixed performance in calibration, indicating potential imbalance of certain stroke features in the training data set. As a result, using a probability of .5 as a general cut-off might not be optimal for all stroke features. Future work is required to adequately calibrate the model for all stroke features.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study represents a step forward in NLP adoption for neuroimaging among patients with cerebrovascular disease. Our work demonstrates an effective and customizable NLP pipeline for retrieving multiple stroke features from large amounts of unstructured imaging notes. We believe that our model, derived from the latest artificial intelligence technology, will benefit stroke research and patient safety. To fully understand the impact on the health care industry, future work in the data pipeline deployment and evaluation is anticipated.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary tables and figures.</p>
        <media xlink:href="ai_v2i1e42884_app1.docx" xlink:title="DOCX File , 898 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AIS</term>
          <def>
            <p>acute ischemic stroke</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CT</term>
          <def>
            <p>computed tomography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ICH</term>
          <def>
            <p>intracerebral hemorrhage</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">MLM</term>
          <def>
            <p>masked language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MRI</term>
          <def>
            <p>magnetic resonance imaging</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">REINAH</term>
          <def>
            <p>Registry for Neurological Endpoint Assessments among Patients with Ischemic and Hemorrhagic Stroke</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This project is supported by the Center for Health Data Science and Analytics, Department of Neurosurgery, and the Neurological Institute at Houston Methodist.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>EH conceived the study and performed data analysis and natural language processing modeling. ATB and TP helped with manual annotation. All authors contributed to the manuscript writing.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Orfanoudaki</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Caprasse</surname>
              <given-names>FPM</given-names>
            </name>
            <name name-style="western">
              <surname>Hutch</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fard</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Balogun</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>MI</given-names>
            </name>
            <name name-style="western">
              <surname>Minnig</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Saglam</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Prescott</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Greer</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Smirnakis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bertsimas</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Machine learning and natural language processing methods to identify ischemic stroke, acuity and location from radiology reports</article-title>
          <source>PLoS One</source>
          <year>2020</year>
          <volume>15</volume>
          <issue>6</issue>
          <fpage>e0234908</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0234908"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0234908</pub-id>
          <pub-id pub-id-type="medline">32559211</pub-id>
          <pub-id pub-id-type="pii">PONE-D-19-31481</pub-id>
          <pub-id pub-id-type="pmcid">PMC7304623</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grivas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Alex</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grover</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tobin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Whiteley</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Not a cute stroke: analysis of rule- and neural network-based information extraction systems for brain radiology reports</article-title>
          <source>Proceedings of the 11th International Workshop on Health Text Mining and Information Analysis</source>
          <year>2020</year>
          <fpage>24</fpage>
          <lpage>37</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2020.louhi-1.4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garg</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Naidech</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kording</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Prabhakaran</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Automating ischemic stroke subtype classification using machine learning and natural language processing</article-title>
          <source>J Stroke Cerebrovasc Dis</source>
          <year>2019</year>
          <month>07</month>
          <volume>28</volume>
          <issue>7</issue>
          <fpage>2045</fpage>
          <lpage>2051</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jstrokecerebrovasdis.2019.02.004</pub-id>
          <pub-id pub-id-type="medline">31103549</pub-id>
          <pub-id pub-id-type="pii">S1052-3057(19)30048-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sorin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Barash</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Konen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Klang</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Deep learning for natural language processing in radiology—fundamentals and a systematic review</article-title>
          <source>J Am Coll Radiol</source>
          <year>2020</year>
          <month>05</month>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>639</fpage>
          <lpage>648</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jacr.2019.12.026</pub-id>
          <pub-id pub-id-type="medline">32004480</pub-id>
          <pub-id pub-id-type="pii">S1546-1440(20)30003-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>AYX</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>ZA</given-names>
            </name>
            <name name-style="western">
              <surname>Pou-Prom</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lopes</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kapral</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Aviv</surname>
              <given-names>RI</given-names>
            </name>
            <name name-style="western">
              <surname>Mamdani</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Automating stroke data extraction from free-text radiology reports using natural language processing: instrument validation study</article-title>
          <source>JMIR Med Inform</source>
          <year>2021</year>
          <month>05</month>
          <day>04</day>
          <volume>9</volume>
          <issue>5</issue>
          <fpage>e24381</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2021/5/e24381/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/24381</pub-id>
          <pub-id pub-id-type="medline">33944791</pub-id>
          <pub-id pub-id-type="pii">v9i5e24381</pub-id>
          <pub-id pub-id-type="pmcid">PMC8132979</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wheater</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mair</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sudlow</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Alex</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Grover</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Whiteley</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>A validated natural language processing algorithm for brain imaging phenotypes from radiology reports in UK electronic health records</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2019</year>
          <month>09</month>
          <day>09</day>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>184</fpage>
          <pub-id pub-id-type="doi">10.1186/s12911-019-0908-7</pub-id>
          <pub-id pub-id-type="medline">31500613</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-019-0908-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6734359</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berman</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Biery</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Ginder</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hulme</surname>
              <given-names>OL</given-names>
            </name>
            <name name-style="western">
              <surname>Marcusa</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Leiva</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Cardin</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hainer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatt</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Di Carli</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Turchin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Blankstein</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing for the assessment of cardiovascular disease comorbidities: the Cardio-Canary comorbidity project</article-title>
          <source>Clin Cardiol</source>
          <year>2021</year>
          <month>09</month>
          <volume>44</volume>
          <issue>9</issue>
          <fpage>1296</fpage>
          <lpage>1304</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34347314"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/clc.23687</pub-id>
          <pub-id pub-id-type="medline">34347314</pub-id>
          <pub-id pub-id-type="pmcid">PMC8428009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Lang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Buch</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Rincon</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mehan</surname>
              <given-names>WA</given-names>
            </name>
            <name name-style="western">
              <surname>Leslie-Mazwi</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Kalpathy-Cramer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Analysis of stroke detection during the COVID-19 pandemic using natural language processing of radiology reports</article-title>
          <source>AJNR Am J Neuroradiol</source>
          <year>2021</year>
          <month>03</month>
          <volume>42</volume>
          <issue>3</issue>
          <fpage>429</fpage>
          <lpage>434</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.ajnr.org/cgi/pmidlookup?view=long&#38;pmid=33334851"/>
          </comment>
          <pub-id pub-id-type="doi">10.3174/ajnr.A6961</pub-id>
          <pub-id pub-id-type="medline">33334851</pub-id>
          <pub-id pub-id-type="pii">ajnr.A6961</pub-id>
          <pub-id pub-id-type="pmcid">PMC7959438</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Obeid</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lenert</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing and machine learning algorithm to identify brain MRI reports with acute ischemic stroke</article-title>
          <source>PLoS One</source>
          <year>2019</year>
          <volume>14</volume>
          <issue>2</issue>
          <fpage>e0212778</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0212778"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0212778</pub-id>
          <pub-id pub-id-type="medline">30818342</pub-id>
          <pub-id pub-id-type="pii">PONE-D-18-24904</pub-id>
          <pub-id pub-id-type="pmcid">PMC6394972</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heo</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Seo</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Jeon</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Prediction of stroke outcome using natural language processing-based machine learning of radiology report of brain MRI</article-title>
          <source>J Pers Med</source>
          <year>2020</year>
          <month>12</month>
          <day>16</day>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>286</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=jpm10040286"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/jpm10040286</pub-id>
          <pub-id pub-id-type="medline">33339385</pub-id>
          <pub-id pub-id-type="pii">jpm10040286</pub-id>
          <pub-id pub-id-type="pmcid">PMC7766032</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Kafiabadi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Al Busaidi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Guilhem</surname>
              <given-names>EL</given-names>
            </name>
            <name name-style="western">
              <surname>Lynch</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Townend</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Montvila</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kiik</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Siddiqui</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gadapa</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Benger</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Mazumder</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barker</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ourselin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cole</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Booth</surname>
              <given-names>TC</given-names>
            </name>
          </person-group>
          <article-title>Deep learning to automate the labelling of head MRI datasets for computer vision applications</article-title>
          <source>Eur Radiol</source>
          <year>2022</year>
          <month>01</month>
          <volume>32</volume>
          <issue>1</issue>
          <fpage>725</fpage>
          <lpage>736</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34286375"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00330-021-08132-0</pub-id>
          <pub-id pub-id-type="medline">34286375</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-021-08132-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC8660736</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online May 24, 2019. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1810.04805"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Publicly available clinical BERT embeddings</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online June 6, 2019. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1904.03323"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1909</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smit</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pareek</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lungren</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>CheXbert: Combining automatic labelers and expert annotations for accurate radiology report labeling using BERT</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online Apr 20, 2020. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2004.09167"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.117</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>BoneBert: A BERT-based automated information extraction system of radiology reports for bone fracture detection and diagnosis</article-title>
          <source>IDA 2021: Advances in Intelligent Data Analysis XIX</source>
          <year>2021</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>263</fpage>
          <lpage>274</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pruitt</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Naidech</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Van Ornam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Borczuk</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>A natural language processing algorithm to extract characteristics of subdural hematoma from head CT reports</article-title>
          <source>Emerg Radiol</source>
          <year>2019</year>
          <month>06</month>
          <volume>26</volume>
          <issue>3</issue>
          <fpage>301</fpage>
          <lpage>306</lpage>
          <pub-id pub-id-type="doi">10.1007/s10140-019-01673-4</pub-id>
          <pub-id pub-id-type="medline">30693414</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10140-019-01673-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gordon</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Banerjee</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Block</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Winstead-Derlega</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Mitarai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jarrett</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sanyal</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Wintermark</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kohn</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Natural language processing of head CT reports to identify intracranial mass effect: CTIME algorithm</article-title>
          <source>Am J Emerg Med</source>
          <year>2022</year>
          <month>01</month>
          <volume>51</volume>
          <fpage>388</fpage>
          <lpage>392</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ajem.2021.11.001</pub-id>
          <pub-id pub-id-type="medline">34839182</pub-id>
          <pub-id pub-id-type="pii">S0735-6757(21)00905-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Olthof</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Shouche</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fennema</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>IJpma</surname>
              <given-names>FFA</given-names>
            </name>
            <name name-style="western">
              <surname>Koolstra</surname>
              <given-names>RHC</given-names>
            </name>
            <name name-style="western">
              <surname>Stirler</surname>
              <given-names>VMA</given-names>
            </name>
            <name name-style="western">
              <surname>van Ooijen</surname>
              <given-names>PMA</given-names>
            </name>
            <name name-style="western">
              <surname>Cornelissen</surname>
              <given-names>LJ</given-names>
            </name>
          </person-group>
          <article-title>Machine learning based natural language processing of radiology reports in orthopaedic trauma</article-title>
          <source>Comput Methods Programs Biomed</source>
          <year>2021</year>
          <month>09</month>
          <volume>208</volume>
          <fpage>106304</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0169-2607(21)00378-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.cmpb.2021.106304</pub-id>
          <pub-id pub-id-type="medline">34333208</pub-id>
          <pub-id pub-id-type="pii">S0169-2607(21)00378-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Kades</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bischoff</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Moll</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schnell</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Küchler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Köhler</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Sellner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Heussel</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Kauczor</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Schlemmer</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Maier-Hein</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Kleesiek</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Deep learning-based assessment of oncologic outcomes from natural language processing of structured radiology reports</article-title>
          <source>Radiol Artif Intell</source>
          <year>2022</year>
          <month>09</month>
          <volume>4</volume>
          <issue>5</issue>
          <fpage>e220055</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36204531"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/ryai.220055</pub-id>
          <pub-id pub-id-type="medline">36204531</pub-id>
          <pub-id pub-id-type="pmcid">PMC9530771</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Potter</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Pratap</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nicolas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Alan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bako</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jefferson</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Adegbindin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Baig</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Willingham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Britz</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Tannous</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Farhaan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A neuro-informatics pipeline to support a learning healthcare system for populations with cerebrovascular disease: rationale and design for a registry across an 8-hospital tertiary healthcare system in the greater Houston metropolitan area</article-title>
          <source>JMIR preprints</source>
          <comment>Preprint posted online June 30, 2022</comment>
          <pub-id pub-id-type="doi">10.2196/preprints.40639</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ratinov</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Roth</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Design challenges and misconceptions in named entity recognition</article-title>
          <source>Proceedings of the Thirteenth Conference on Computational Natural Language Learning (CoNLL-2009)</source>
          <year>2009</year>
          <conf-name>CoNLL '09</conf-name>
          <conf-date>June 4-5</conf-date>
          <conf-loc>Boulder, CO</conf-loc>
          <fpage>147</fpage>
          <lpage>155</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W09-1119"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1596374.1596399</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mozayan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fabbri</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Maneevese</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tocino</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chheang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Practical guide to natural language processing for radiology</article-title>
          <source>Radiographics</source>
          <year>2021</year>
          <month>09</month>
          <volume>41</volume>
          <issue>5</issue>
          <fpage>1446</fpage>
          <lpage>1453</lpage>
          <pub-id pub-id-type="doi">10.1148/rg.2021200113</pub-id>
          <pub-id pub-id-type="medline">34469212</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Schuster</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Norouzi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Macherey</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Krikun</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Macherey</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Klingner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Gouws</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kato</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kudo</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kazawa</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Stevens</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kurian</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Patil</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Young</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Riesa</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rudnick</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vinyals</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hughes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Google’s neural machine translation system: bridging the gap between human and machine translation</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online Oct 8, 2016. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1609.08144"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/tacl_a_00065</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stoyanov</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online July 26, 2019. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1907.11692"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title>
          <source>Bioinformatics</source>
          <year>2020</year>
          <month>02</month>
          <day>15</day>
          <volume>36</volume>
          <issue>4</issue>
          <fpage>1234</fpage>
          <lpage>1240</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31501885"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id>
          <pub-id pub-id-type="medline">31501885</pub-id>
          <pub-id pub-id-type="pii">5566506</pub-id>
          <pub-id pub-id-type="pmcid">PMC7703786</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Goodacre</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>On splitting training and validation set: a comparative study of cross-validation, bootstrap and systematic sampling for estimating the generalization performance of supervised learning</article-title>
          <source>J Anal Test</source>
          <year>2018</year>
          <month>10</month>
          <day>29</day>
          <volume>2</volume>
          <issue>3</issue>
          <fpage>249</fpage>
          <lpage>262</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30842888"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s41664-018-0068-2</pub-id>
          <pub-id pub-id-type="medline">30842888</pub-id>
          <pub-id pub-id-type="pii">68</pub-id>
          <pub-id pub-id-type="pmcid">PMC6373628</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cauley</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fielden</surname>
              <given-names>SW</given-names>
            </name>
          </person-group>
          <article-title>Head CT: toward making full use of the information the X-rays give</article-title>
          <source>AJNR Am J Neuroradiol</source>
          <year>2021</year>
          <month>08</month>
          <volume>42</volume>
          <issue>8</issue>
          <fpage>1362</fpage>
          <lpage>1369</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.ajnr.org/cgi/pmidlookup?view=long&#38;pmid=34140278"/>
          </comment>
          <pub-id pub-id-type="doi">10.3174/ajnr.A7153</pub-id>
          <pub-id pub-id-type="medline">34140278</pub-id>
          <pub-id pub-id-type="pii">ajnr.A7153</pub-id>
          <pub-id pub-id-type="pmcid">PMC8367614</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nag</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Khandakar</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Prediction of clinical outcome in acute hemorrhagic stroke from a single CT Scan on admission</article-title>
          <source>N Am J Med Sci</source>
          <year>2012</year>
          <month>10</month>
          <volume>4</volume>
          <issue>10</issue>
          <fpage>463</fpage>
          <lpage>467</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.najms.org/article.asp?issn=1947-2714;year=2012;volume=4;issue=10;spage=463;epage=467;aulast=Nag"/>
          </comment>
          <pub-id pub-id-type="doi">10.4103/1947-2714.101986</pub-id>
          <pub-id pub-id-type="medline">23112967</pub-id>
          <pub-id pub-id-type="pii">NAJMS-4-463</pub-id>
          <pub-id pub-id-type="pmcid">PMC3482777</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Daverat</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Castel</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Dartigues</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Orgogozo</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Death and functional outcome after spontaneous intracerebral hemorrhage. A prospective study of 166 cases using multivariate analysis</article-title>
          <source>Stroke</source>
          <year>1991</year>
          <month>01</month>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1161/01.str.22.1.1</pub-id>
          <pub-id pub-id-type="medline">1987664</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Canton</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Dadashzadeh</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Yip</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Forsythe</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Handzel</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Automatic detection of thyroid and adrenal incidentals using radiology reports and deep learning</article-title>
          <source>J Surg Res</source>
          <year>2021</year>
          <month>10</month>
          <volume>266</volume>
          <fpage>192</fpage>
          <lpage>200</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jss.2021.03.060</pub-id>
          <pub-id pub-id-type="medline">34020097</pub-id>
          <pub-id pub-id-type="pii">S0022-4804(21)00227-4</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
