<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e68212</article-id><article-id pub-id-type="doi">10.2196/68212</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Supervised Natural Language Processing Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Parker</surname><given-names>Susan T</given-names></name><degrees>MS, MPP, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Feinberg School of Medicine, Northwestern University</institution><addr-line>750 N Lakeshore</addr-line><addr-line>Chicago</addr-line><addr-line>IL</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Emam</surname><given-names>Khaled El</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Arseniev-Koehler</surname><given-names>Alina</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Bowen</surname><given-names>Daniel</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Susan T Parker, MS, MPP, PhD, Feinberg School of Medicine, Northwestern University, 750 N Lakeshore, Chicago, IL, 60611, United States, 1 2487613116; <email>susan.parker@northwestern.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>19</day><month>6</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e68212</elocation-id><history><date date-type="received"><day>30</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>19</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>15</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Susan T Parker. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 19.6.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e68212"/><abstract><sec><title>Background</title><p>The recent availability of law enforcement and coroner or medical examiner reports for nearly every violent death in the United States expands the potential for natural language processing (NLP) research into violence.</p></sec><sec><title>Objective</title><p>The objective of this work is to assess applications of supervised NLP to unstructured data in the National Violent Death Reporting System to predict circumstances and types of violent death.</p></sec><sec sec-type="methods"><title>Methods</title><p>This analysis applied distilBERT, a compact large language model (LLM) with fewer parameters relative to full-scale LLMs, to unstructured narrative data to simulate the impacts of preprocessing, volume, and composition of training data on model performance, evaluated by <italic>F</italic><sub>1</sub>-scores, precision, recall, and the false negative rate. Model performance was evaluated for bias by race, ethnicity, and sex by comparing <italic>F</italic><sub>1</sub>-scores across subgroups.</p></sec><sec sec-type="results"><title>Results</title><p>A minimum training set of 1500 cases was necessary to achieve an <italic>F</italic><sub>1</sub>-score of 0.6 and a false negative rate of 0.01-0.05 with a compact LLM. Replacement of domain-specific jargon improved model performance, while oversampling positive class cases to address class imbalance did not substantially improve <italic>F</italic><sub>1</sub>-scores. 
Between racial and ethnic groups, <italic>F</italic><sub>1</sub>-score disparities ranged from 0.2 to 0.25, and between male and female decedents, differences ranged from 0.12 to 0.2.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Compact LLMs with sufficient training data can be applied to supervised NLP tasks with a class imbalance in the National Violent Death Reporting System. Simulations of supervised text classification across the model-fitting process, from preprocessing to training a compact LLM, informed NLP applications to unstructured death narrative data.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>violence</kwd><kwd>informatics</kwd><kwd>text classification</kwd><kwd>simulation</kwd><kwd>violent death</kwd><kwd>narrative</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>injury prevention</kwd><kwd>violent injury</kwd><kwd>coroner reports</kwd><kwd>police report</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Violent injuries are among the leading causes of death in the United States for individuals younger than the age of 44 years and are leading causes for young people aged 10&#x2010;34 years [<xref ref-type="bibr" rid="ref1">1</xref>]. The most comprehensive and detailed source of data on violent deaths in the United States is the National Violent Death Reporting System (NVDRS), aggregating information from death certificates, coroner or medical examiner reports, and law enforcement (LE) reports to characterize violent deaths [<xref ref-type="bibr" rid="ref2">2</xref>]. 
Researchers have used structured data from NVDRS extensively to characterize the epidemiology of violent deaths including homicides [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>], suicides [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref10">10</xref>], and those that result from legal intervention (police shootings) [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>NVDRS has been widely used for its structured data, which captures information such as decedent characteristics, weapons, circumstances, and suspect information [<xref ref-type="bibr" rid="ref12">12</xref>], and increasing attention has been given to the vast amounts of unstructured text data embedded within the narrative reports. Narratives provide rich details about the incident not necessarily captured in structured variables, such as nuanced descriptions of precipitating events and other contextual factors that are difficult to quantify.</p><p>A range of approaches have been applied to the use of NVDRS narratives in research on violent deaths. According to a recent review on the research use of textual NVDRS narratives over the past 2 decades, most studies used manual review or keyword searches of narratives [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], while 5% used machine learning tools designed to analyze unstructured text, known as natural language processing (NLP) [<xref ref-type="bibr" rid="ref15">15</xref>]. Applications of NLP have included supervised learning tasks, such as classification, as well as unsupervised tasks, such as topic modeling. 
For instance, supervised NLP has been used to classify suicide related to driving cessation [<xref ref-type="bibr" rid="ref16">16</xref>] and assisted living facilities [<xref ref-type="bibr" rid="ref17">17</xref>], examine suicide intent classification [<xref ref-type="bibr" rid="ref18">18</xref>] and intimate partner homicide [<xref ref-type="bibr" rid="ref19">19</xref>], and predict drug overdose deaths [<xref ref-type="bibr" rid="ref20">20</xref>]. Latent class analysis has been used to reveal salient topics unrepresented in abstractor classification [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] and themes in youth suicide [<xref ref-type="bibr" rid="ref23">23</xref>]. Most recently, researchers have used NLP to classify social determinants of health in suicide narratives [<xref ref-type="bibr" rid="ref24">24</xref>] and inconsistencies, biases, and missing data in the narratives themselves [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>Continued application of NLP to NVDRS is particularly important because the volume of NVDRS data will substantially increase over time. NVDRS has gathered data on over 500,000 deaths since 2003 and will grow by approximately 100,000 records annually moving forward as additional states and counties participate, underlining the importance of efficiently investigating research questions using NVDRS narratives and NLP methods.</p><p>Although large language models (LLMs) have generally performed better than other NLP approaches to narrative data in medical informatics domains, fewer applications of LLMs to NVDRS exist [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. 
Applications of NLP to a related text narrative type, clinical notes from medical providers, have identified patient self-harm [<xref ref-type="bibr" rid="ref29">29</xref>-<xref ref-type="bibr" rid="ref34">34</xref>] and violence-related [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref38">38</xref>] outcomes often using LLMs or deep learning approaches. In part, researchers and practitioners may face particular challenges applying LLMs to NVDRS. One important challenge is that many outcomes of interest are likely to be infrequent or rare events that can present classification challenges due to sparse information about the outcome [<xref ref-type="bibr" rid="ref39">39</xref>-<xref ref-type="bibr" rid="ref41">41</xref>]. Further, NVDRS narratives are composed of police and coroner reports, which contain domain-specific language or jargon, such as the use of <italic>ICD</italic> (<italic>International Classification of Diseases</italic>) codes. Third, NVDRS data restrictions on sensitive data do not permit narratives to be stored in the cloud, thus limiting access to computing resources that are often used to train or fine-tune LLMs. Fourth, researchers documented racial disparities in narratives alongside gendered text differences in NVDRS [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Narratives involving victims from marginalized populations tend to be significantly shorter in length and are more likely to be missing altogether. These differences in data quality may result in models that generate predictions with similar patterns of subgroup bias.</p><p>To address these challenges, this paper conducts simulations of supervised text classification that span the machine learning pipeline, from data preprocessing and model training to the evaluation of predictions for potential racial or gender bias. 
Existing coded variables that record the type (eg, police shooting and drive-by shooting) or circumstances (number of nonfatal shooting victims and location of victim injuries) of the violent death are used as target outcomes in simulations. Target outcomes with class imbalance were selected, as this setting is likely of most use to NVDRS applications, and models were fit using a compact LLM to reflect settings where computing resources are limited. By conducting simulations, this analysis aims to inform future applications of supervised classification using LLMs to NVDRS by establishing concrete benchmarks for understanding training data quantity, preprocessing needs, and to what extent NLP results in predictions reflecting existing racial or gender bias in narratives.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data</title><p>This analysis used violent death records from NVDRS data from 2015 to 2020. The NVDRS gathers information about violent deaths including homicides, suicides, and deaths caused by LE. NVDRS combines data from death certificates, coroner or medical examiner reports, and LE reports, providing context about violent deaths including information about mental health conditions, toxicology results, and other circumstances in addition to details about victim characteristics. Trained medical abstractors code information from reports about violent deaths into the over 600 variables that comprise the NVDRS surveillance system [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>To obtain labeled outcomes for use as target outcomes in simulations, this analysis constructed measures from existing coded NVDRS variables that abstractors label. Because a substantial proportion of coded NVDRS fields group together case outcomes that are negative with those that are not known, this analysis instead relied on multinomial fields or combined separate NVDRS coded variables to obtain target outcomes for simulations. 
For instance, for case outcomes such as mental health crisis or drug involvement, outcomes are coded as &#x201C;Yes&#x201D; or as &#x201C;No, not available, unknown,&#x201D; which would not constitute a labeled outcome.</p><p>These constructed outcomes include 4 binary outcomes likely to be recorded accurately when known. The first outcome is whether or not a homicide is a legal intervention homicide, meaning the shooter was a LE officer. Literature suggests that these homicides are well-recorded in NVDRS and less subject to noisy labeling or measurement error [<xref ref-type="bibr" rid="ref11">11</xref>]. The second outcome is whether or not a homicide is classified as a drive-by shooting. The third outcome is whether a homicide occurred at home or not, and the fourth outcome is whether or not additional victims were nonfatally shot in the course of a homicide event. We constrain the sample to where the weapon type is listed as a firearm, and the abstractor manner of death is a homicide. Taken together, these outcomes represent a range of language complexity and frequency less subject to label noise by constructing outcomes.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The Northwestern University institutional review board deemed that this research did not require review, as it did not involve human participants.</p></sec><sec id="s2-3"><title>Statistical Analysis</title><p>This analysis compared model performance across 4 configurations of training data and text composition using a compact LLM. The configurations examined included preprocessing of text data as well as the amount and composition of the training data. Specifically, the analysis first varied the amount of training data that the model was fitted on to inform how much randomly sampled training data must be annotated to train an LLM to predict NVDRS outcomes. 
Second, because positive class cases were often infrequent, the analysis simulated the oversampling of positive class cases in training data. Specifically, oversampling included a larger proportion of additional positive class cases, holding the negative class cases constant, to inform what composition of training data was most effective to include as training data.</p><p>This analysis additionally simulated different preprocessing techniques for unstructured text data. NVDRS text may be domain-specific, as it comprises police and coroner reports, which use both jargon and abbreviation. To simulate the impacts of clarifying common abbreviations, this analysis replaced NVDRS abbreviations with unabbreviated text. For example, often when NVDRS abstractors referred to victims and suspects in the report narratives, the abbreviations &#x201C;v&#x201D; for victim and &#x201C;s&#x201D; for suspect appeared rather than the full word. Abbreviations referring to victims, suspects, police, and gunshot wounds were replaced (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Finally, the analysis simulated omitting coroner report text from the training data. Coroner reports may contain extraneous text such as toxicology reports that may be noisy in the context of prediction focused on criminal justice outcomes. Further, compact LLMs have limited token lengths, which constrain the number of words in an input narrative, and the combination of coroner and homicide reports can exceed the token length in some LLM applications. Because our outcomes are LE-focused, the analysis simulated the omission of potentially extraneous narrative information.</p><p>The analysis began by preprocessing the coroner and police narrative by removing special characters including numbers, punctuation, and capitalization as is standard. 
Police and coroner report narratives were combined into a single field in order to use the information available in both narratives (with the exception of the LE narrative&#x2013;only simulation).</p><p>Next, the analysis turned to creating simulated data. First, a test set on which the model outputs were to be evaluated was randomly selected. The test set consisted of a random sample of 30% of each outcome&#x2019;s records, which was then held out from any selection into the training data.</p><p>To vary the amounts of training data, the analysis used different training data record counts, each with a different amount of training data. These splits ranged from a minimum of 100 cases, increasing in increments to 200, 500, 1000, 1500, and up to 2000 cases. Each split was randomly sampled from the full dataset specified for each outcome so that each training split maintained a proportion of positive and negative cases that approximates the true proportion. The prior sample was included in the next iteration to isolate the impact of adding additional training data, not adding different training data. For instance, to obtain 500 cases, first, the prior 200 cases were preserved, and an additional 300 were sampled to comprise 500 cases.</p><p>To simulate the impacts of language replacement and LE-only text, the analysis followed the procedure outlined earlier to randomly select training data in the same 100, 200, 500, 1000, 1500, and 2000 increments.</p><p>In the second configuration of training data, the composition of positive class cases was altered from the true proportion in the training data. Instead of randomly sampling cases, the proportion of positive class cases was increased in the training data by adding additional positive class cases to the negative class cases. 
The positive class cases were incrementally increased until they comprised 10%, 20%, 30%, 40%, and up to 50% of the training data starting from a baseline of 1000 cases, as lower amounts of training data were not performant in this application. For instance, to obtain training data composed of 10% positive class cases for legal intervention homicide, the process started with randomly sampled training data with 1000 records, of which 54 were legal intervention homicides and 946 were not. To the 946 negative class cases, 59 additional positive class cases were added so that the total number of positive class cases was 113 (54+59), and the total was 1059 cases, of which approximately 10% (113/1059) were legal intervention homicides.</p><p>For each of the configurations described earlier, distilBERT, an LLM with fewer parameters but comparable accuracy to large-scale LLMs, was used [<xref ref-type="bibr" rid="ref41">41</xref>]. Compact LLMs better allow for simulations because of fewer computational needs and because NVDRS data restrictions do not permit cloud storage and computing. The distilBERT models were fine-tuned on training data to select model parameters. Parameters were selected in initial fine-tuning using 2 outcomes (legal intervention and drive-by). Because model parameters in each fine-tuned model were identical, these parameters were applied to each training data configuration (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Because our target outcomes are imbalanced, we add a weighted trainer to account for class imbalance. Configurations are summarized in Table S3 in Multimedia Appendix 1.</p><p>Classification performance was measured using learning curves, which plot performance metrics relative to differing splits of labeled training data to evaluate classifier model performance. 
Binary classification model metrics including precision and recall in addition to metrics considered useful for imbalanced class problems, including an <italic>F</italic><sub>1</sub>-score, were used. Finally, to analyze classification performance by subgroup, learning curves were created for sex, race, and ethnicity subgroups.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Classification outcomes differed by the proportion of positive to negative cases in each outcome (<xref ref-type="table" rid="table1">Table 1</xref>). The most rare positive class outcome was a police shooting (n=4489, 5.9%) followed by drive-by shootings (n=6575, 9.2%) and shootings where additional individuals were nonfatally shot (n=8052, 15.2%) in the course of the homicide. The most prevalent outcome was whether an individual is shot in their home (n=16,850, 24.8%) relative to another location outside the home. Victims of homicide in the sample tended to be male (n=4319-11,321; 67.2-96.2%), Black or African American (n=44,546-43,357, 58.5%&#x2010;60.5%), and young, with the most frequent age range between 25 and 34 years (<xref ref-type="table" rid="table1">Table 1</xref>). Intimate partner violence characterizes over a tenth of homicides overall but within cases where an individual is injured at home, intimate partner violence (n=17,226, 26.5%) occurred in over a quarter of cases. Legal intervention homicides were most likely associated with mental health problems and alcohol use.</p><p>Circumstances were known for almost all cases of legal intervention and drive-by shootings, but less information was known about the circumstances of homicide where additional individuals were shot or when they were injured at home (<xref ref-type="table" rid="table2">Table 2</xref>). Circumstances were known in 71% (n=30,774) of homicides of Black decedents in contrast to 83.7% (n=8698) among Hispanic and non-Hispanic White decedents. 
The median number of words in a narrative for a LE narrative ranged from 80&#x2010;83, whereas coroner and medical examiner narratives ranged from 88 to 91 words in length. Legal intervention homicides had the most lengthy narratives (115 for LE and 120 for coroner and medical examiner). Narrative length differed by race and sex. Among LE narratives, the median length for Black decedents was 98 words but 132 for non-Hispanic White decedents. Narrative length differed among male and female decedents. Female decedents had longer narratives for each homicide outcome. Female decedents shot at home had a median narrative length of 124 words in contrast to male decedents shot at home with a length of 92 words.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Sample descriptive statistics, characteristics by outcome.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom" colspan="3">Drive-by</td><td align="left" valign="bottom" colspan="3">Legal intervention</td><td align="left" valign="bottom" colspan="3">Number nonfatally shot</td><td align="left" valign="bottom" colspan="3">Individual injured at home</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Overall (n=71,708), n (%)</td><td align="left" valign="top">Negative case (n=65,133), n (%)</td><td align="left" valign="top">Positive case (n=6575), n (%)</td><td align="left" valign="top">Overall (n=76,197), n (%)</td><td align="left" valign="top">Negative case (n=71,708), n (%)</td><td align="left" valign="top">Positive case (n=4489), n (%)</td><td align="left" valign="top">Overall (n=53,024), n (%)</td><td align="left" valign="top">Negative case (n=44,972), n (%)</td><td align="left" valign="top">Positive case (n=8052), n (%)</td><td align="left" valign="top">Overall (n=68,016), n (%)</td><td align="left" valign="top">Negative case (n=51,166), n (%)</td><td align="left" 
valign="top">Positive case (n=16,850), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="13">Sex</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="char" char="." valign="top">11,255 (15.7)</td><td align="char" char="." valign="top">10,587 (16.3)</td><td align="char" char="." valign="top">668 (10.2)</td><td align="char" char="." valign="top">11,425 (15)</td><td align="char" char="." valign="top">11,255 (15.7)</td><td align="char" char="." valign="top">170 (3.8)</td><td align="char" char="." valign="top">8378 (15.8)</td><td align="char" char="." valign="top">7043 (15.7)</td><td align="char" char="." valign="top">1335 (16.6)</td><td align="char" char="." valign="top">10,744 (15.8)</td><td align="char" char="." valign="top">5215 (10.2)</td><td align="char" char="." valign="top">5529 (32.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="char" char="." valign="top">60,447 (84.3)</td><td align="char" char="." valign="top">54,540 (83.7)</td><td align="char" char="." valign="top">5907 (89.8)</td><td align="char" char="." valign="top">64,766 (85)</td><td align="char" char="." valign="top">60,447 (84.3)</td><td align="char" char="." valign="top">4319 (96.2)</td><td align="char" char="." valign="top">44,640 (84.2)</td><td align="char" char="." valign="top">37,923 (84.3)</td><td align="char" char="." valign="top">6717 (83.4)</td><td align="char" char="." valign="top">57,272 (84.2)</td><td align="char" char="." valign="top">45,951 (89.8)</td><td align="char" char="." 
valign="top">11,321 (67.2)</td></tr><tr><td align="left" valign="top" colspan="13">Race or ethnicity</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>American Indian or Alaska Native, non-Hispanic</td><td align="char" char="." valign="top">753 (1.1)</td><td align="char" char="." valign="top">712 (1.1)</td><td align="char" char="." valign="top">41 (0.6)</td><td align="char" char="." valign="top">897 (1.2)</td><td align="char" char="." valign="top">753 (1.1)</td><td align="char" char="." valign="top">144 (3.2)</td><td align="char" char="." valign="top">612 (1.2)</td><td align="char" char="." valign="top">544 (1.2)</td><td align="char" char="." valign="top">68 (0.8)</td><td align="char" char="." valign="top">717 (1.1)</td><td align="char" char="." valign="top">510 (1)</td><td align="char" char="." valign="top">207 (1.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Asian or Pacific Islander, non-Hispanic</td><td align="char" char="." valign="top">806 (1.1)</td><td align="char" char="." valign="top">768 (1.2)</td><td align="char" char="." valign="top">38 (0.6)</td><td align="char" char="." valign="top">868 (1.1)</td><td align="char" char="." valign="top">806 (1.1)</td><td align="char" char="." valign="top">62 (1.4)</td><td align="char" char="." valign="top">601 (1.1)</td><td align="char" char="." valign="top">513 (1.1)</td><td align="char" char="." valign="top">88 (1.1)</td><td align="char" char="." valign="top">773 (1.1)</td><td align="char" char="." valign="top">526 (1)</td><td align="char" char="." valign="top">247 (1.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Black or African American, non-Hispanic</td><td align="char" char="." valign="top">43,357 (60.5)</td><td align="char" char="." 
valign="top">38,976 (59.8)</td><td align="char" char="." valign="top">4381 (66.6)</td><td align="char" char="." valign="top">44,546 (58.5)</td><td align="char" char="." valign="top">43,357 (60.5)</td><td align="char" char="." valign="top">1189 (26.5)</td><td align="char" char="." valign="top">31,293 (59)</td><td align="char" char="." valign="top">25,925 (57.6)</td><td align="char" char="." valign="top">5368 (66.7)</td><td align="char" char="." valign="top">41,109 (60.4)</td><td align="char" char="." valign="top">33,639 (65.7)</td><td align="char" char="." valign="top">7470 (44.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hispanic</td><td align="char" char="." valign="top">10,388 (14.5)</td><td align="char" char="." valign="top">8745 (13.4)</td><td align="char" char="." valign="top">1643 (25)</td><td align="char" char="." valign="top">11,182 (14.7)</td><td align="char" char="." valign="top">10,388 (14.5)</td><td align="char" char="." valign="top">794 (17.7)</td><td align="char" char="." valign="top">8116 (15.3)</td><td align="char" char="." valign="top">6898 (15.3)</td><td align="char" char="." valign="top">1218 (15.1)</td><td align="char" char="." valign="top">9977 (14.7)</td><td align="char" char="." valign="top">8098 (15.8)</td><td align="char" char="." valign="top">1879 (11.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White, non-Hispanic</td><td align="char" char="." valign="top">15,457 (21.6)</td><td align="char" char="." valign="top">15,049 (23.1)</td><td align="char" char="." valign="top">408 (6.2)</td><td align="char" char="." valign="top">17,654 (23.2)</td><td align="char" char="." valign="top">15,457 (21.6)</td><td align="char" char="." valign="top">2197 (48.9)</td><td align="char" char="." valign="top">11,687 (22)</td><td align="char" char="." 
valign="top">10,476 (23.3)</td><td align="char" char="." valign="top">1211 (15)</td><td align="char" char="." valign="top">14,589 (21.4)</td><td align="char" char="." valign="top">7772 (15.2)</td><td align="char" char="." valign="top">6817 (40.5)</td></tr><tr><td align="left" valign="top" colspan="13">Age bins (years)</td></tr><tr><td align="char" char="hyphen" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>15-24</td><td align="char" char="." valign="top">19,403 (27.1)</td><td align="char" char="." valign="top">17,012 (26.1)</td><td align="char" char="." valign="top">2391 (36.4)</td><td align="char" char="." valign="top">20,052 (26.3)</td><td align="char" char="." valign="top">19,403 (27.1)</td><td align="char" char="." valign="top">649 (14.5)</td><td align="char" char="." valign="top">14,421 (27.2)</td><td align="char" char="." valign="top">11,642 (25.9)</td><td align="char" char="." valign="top">2779 (34.5)</td><td align="char" char="." valign="top">18,454 (27.1)</td><td align="char" char="." valign="top">15,811 (30.9)</td><td align="char" char="." valign="top">2643 (15.7)</td></tr><tr><td align="char" char="hyphen" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>25-34</td><td align="char" char="." valign="top">21,065 (29.4)</td><td align="char" char="." valign="top">18,981 (29.1)</td><td align="char" char="." valign="top">2084 (31.7)</td><td align="char" char="." valign="top">22,334 (29.3)</td><td align="char" char="." valign="top">21,065 (29.4)</td><td align="char" char="." valign="top">1269 (28.3)</td><td align="char" char="." valign="top">15,523 (29.3)</td><td align="char" char="." valign="top">13,106 (29.1)</td><td align="char" char="." valign="top">2417 (30)</td><td align="char" char="." valign="top">20,011 (29.4)</td><td align="char" char="." valign="top">16,324 (31.9)</td><td align="char" char="." 
valign="top">3687 (21.9)</td></tr><tr><td align="char" char="hyphen" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>35-44</td><td align="char" char="." valign="top">11,759 (16.4)</td><td align="char" char="." valign="top">10,907 (16.7)</td><td align="char" char="." valign="top">852 (13)</td><td align="char" char="." valign="top">12,758 (16.7)</td><td align="char" char="." valign="top">11,759 (16.4)</td><td align="char" char="." valign="top">999 (22.3)</td><td align="char" char="." valign="top">8595 (16.2)</td><td align="char" char="." valign="top">7510 (16.7)</td><td align="char" char="." valign="top">1085 (13.5)</td><td align="char" char="." valign="top">11,136 (16.4)</td><td align="char" char="." valign="top">8128 (15.9)</td><td align="char" char="." valign="top">3008 (17.9)</td></tr><tr><td align="char" char="hyphen" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>45-54</td><td align="char" char="." valign="top">6446 (9)</td><td align="char" char="." valign="top">6088 (9.3)</td><td align="char" char="." valign="top">358 (5.4)</td><td align="char" char="." valign="top">7074 (9.3)</td><td align="char" char="." valign="top">6446 (9)</td><td align="char" char="." valign="top">628 (14)</td><td align="char" char="." valign="top">4811 (9.1)</td><td align="char" char="." valign="top">4303 (9.6)</td><td align="char" char="." valign="top">508 (6.3)</td><td align="char" char="." valign="top">6097 (9)</td><td align="char" char="." valign="top">3716 (7.3)</td><td align="char" char="." valign="top">2381 (14.1)</td></tr><tr><td align="char" char="hyphen" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>55-64</td><td align="char" char="." valign="top">3545 (4.9)</td><td align="char" char="." valign="top">3378 (5.2)</td><td align="char" char="." valign="top">167 (2.5)</td><td align="char" char="." 
valign="top">3897 (5.1)</td><td align="char" char="." valign="top">3545 (4.9)</td><td align="char" char="." valign="top">352 (7.8)</td><td align="char" char="." valign="top">2591 (4.9)</td><td align="char" char="." valign="top">2333 (5.2)</td><td align="char" char="." valign="top">258 (3.2)</td><td align="char" char="." valign="top">3330 (4.9)</td><td align="char" char="." valign="top">1655 (3.2)</td><td align="char" char="." valign="top">1675 (9.9)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>65+</td><td align="char" char="." valign="top">2452 (3.4)</td><td align="char" char="." valign="top">2364 (3.6)</td><td align="char" char="." valign="top">88 (1.3)</td><td align="char" char="." valign="top">2601 (3.4)</td><td align="char" char="." valign="top">2452 (3.4)</td><td align="char" char="." valign="top">149 (3.3)</td><td align="char" char="." valign="top">1806 (3.4)</td><td align="char" char="." valign="top">1612 (3.6)</td><td align="char" char="." valign="top">194 (2.4)</td><td align="char" char="." valign="top">2309 (3.4)</td><td align="char" char="." valign="top">687 (1.3)</td><td align="char" char="." valign="top">1622 (9.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unknown</td><td align="char" char="." valign="top">6327 (8.8)</td><td align="char" char="." valign="top">5771 (8.9)</td><td align="char" char="." valign="top">556 (8.5)</td><td align="char" char="." valign="top">6766 (8.9)</td><td align="char" char="." valign="top">6327 (8.8)</td><td align="char" char="." valign="top">439 (9.8)</td><td align="char" char="." valign="top">4773 (9)</td><td align="char" char="." valign="top">4085 (9.1)</td><td align="char" char="." valign="top">688 (8.5)</td><td align="char" char="." valign="top">5999 (8.8)</td><td align="char" char="." valign="top">4486 (8.8)</td><td align="char" char="." 
valign="top">1513 (9)</td></tr><tr><td align="left" valign="top" colspan="13">Intimate partner violence</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No, not available, unknown</td><td align="char" char="." valign="top">64,071 (89.3)</td><td align="char" char="." valign="top">57,642 (88.5)</td><td align="char" char="." valign="top">6429 (97.8)</td><td align="char" char="." valign="top">68,095 (89.4)</td><td align="char" char="." valign="top">64,071 (89.3)</td><td align="char" char="." valign="top">4024 (89.6)</td><td align="char" char="." valign="top">47,096 (88.8)</td><td align="char" char="." valign="top">39,667 (88.2)</td><td align="char" char="." valign="top">7429 (92.3)</td><td align="char" char="." valign="top">60,535 (89)</td><td align="char" char="." valign="top">48,145 (94.1)</td><td align="char" char="." valign="top">12,390 (73.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="char" char="." valign="top">7637 (10.7)</td><td align="char" char="." valign="top">7491 (11.5)</td><td align="char" char="." valign="top">146 (2.2)</td><td align="char" char="." valign="top">8102 (10.6)</td><td align="char" char="." valign="top">7637 (10.7)</td><td align="char" char="." valign="top">465 (10.4)</td><td align="char" char="." valign="top">5928 (11.2)</td><td align="char" char="." valign="top">5305 (11.8)</td><td align="char" char="." valign="top">623 (7.7)</td><td align="char" char="." valign="top">7481 (11)</td><td align="char" char="." valign="top">3021 (5.9)</td><td align="char" char="." valign="top">4460 (26.5)</td></tr><tr><td align="left" valign="top" colspan="13">Mental health problem</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No, not available, unknown</td><td align="char" char="." 
valign="top">69,794 (97.3)</td><td align="char" char="." valign="top">63,323 (97.2)</td><td align="char" char="." valign="top">6471 (98.4)</td><td align="char" char="." valign="top">73,436 (96.4)</td><td align="char" char="." valign="top">69,794 (97.3)</td><td align="char" char="." valign="top">3642 (81.1)</td><td align="char" char="." valign="top">51,544 (97.2)</td><td align="char" char="." valign="top">43,633 (97)</td><td align="char" char="." valign="top">7911 (98.2)</td><td align="char" char="." valign="top">66,149 (97.3)</td><td align="char" char="." valign="top">50,059 (97.8)</td><td align="char" char="." valign="top">16,090 (95.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="char" char="." valign="top">1914 (2.7)</td><td align="char" char="." valign="top">1810 (2.8)</td><td align="char" char="." valign="top">104 (1.6)</td><td align="char" char="." valign="top">2761 (3.6)</td><td align="char" char="." valign="top">1914 (2.7)</td><td align="char" char="." valign="top">847 (18.9)</td><td align="char" char="." valign="top">1480 (2.8)</td><td align="char" char="." valign="top">1339 (3)</td><td align="char" char="." valign="top">141 (1.8)</td><td align="char" char="." valign="top">1867 (2.7)</td><td align="char" char="." valign="top">1107 (2.2)</td><td align="char" char="." valign="top">760 (4.5)</td></tr><tr><td align="left" valign="top" colspan="13">Alcohol result</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Not present</td><td align="char" char="." valign="top">29,990 (41.8)</td><td align="char" char="." valign="top">26,479 (40.7)</td><td align="char" char="." valign="top">3511 (53.4)</td><td align="char" char="." valign="top">31,916 (41.9)</td><td align="char" char="." valign="top">29,990 (41.8)</td><td align="char" char="." valign="top">1926 (42.9)</td><td align="char" char="." 
valign="top">23,045 (43.5)</td><td align="char" char="." valign="top">19,419 (43.2)</td><td align="char" char="." valign="top">3626 (45)</td><td align="char" char="." valign="top">29,385 (43.2)</td><td align="char" char="." valign="top">22,307 (43.6)</td><td align="char" char="." valign="top">7078 (42)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Present</td><td align="char" char="." valign="top">16,373 (22.8)</td><td align="char" char="." valign="top">14,834 (22.8)</td><td align="char" char="." valign="top">1539 (23.4)</td><td align="char" char="." valign="top">17,732 (23.3)</td><td align="char" char="." valign="top">16,373 (22.8)</td><td align="char" char="." valign="top">1359 (30.3)</td><td align="char" char="." valign="top">12,812 (24.2)</td><td align="char" char="." valign="top">10,658 (23.7)</td><td align="char" char="." valign="top">2154 (26.8)</td><td align="char" char="." valign="top">16,044 (23.6)</td><td align="char" char="." valign="top">12,691 (24.8)</td><td align="char" char="." valign="top">3353 (19.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unknown or not applicable</td><td align="char" char="." valign="top">25,345 (35.3)</td><td align="char" char="." valign="top">23,820 (36.6)</td><td align="char" char="." valign="top">1525 (23.2)</td><td align="char" char="." valign="top">26,549 (34.8)</td><td align="char" char="." valign="top">25,345 (35.3)</td><td align="char" char="." valign="top">1204 (26.8)</td><td align="char" char="." valign="top">17,167 (32.4)</td><td align="char" char="." valign="top">14,895 (33.1)</td><td align="char" char="." valign="top">2272 (28.2)</td><td align="char" char="." valign="top">22,587 (33.2)</td><td align="char" char="." valign="top">16,168 (31.6)</td><td align="char" char="." 
valign="top">6419 (38.1)</td></tr><tr><td align="left" valign="top" colspan="13">Argument</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No, not available, unknown</td><td align="char" char="." valign="top">54,036 (75.4)</td><td align="char" char="." valign="top">48,348 (74.2)</td><td align="char" char="." valign="top">5688 (86.5)</td><td align="char" char="." valign="top">57,841 (75.9)</td><td align="char" char="." valign="top">54,036 (75.4)</td><td align="char" char="." valign="top">3805 (84.8)</td><td align="char" char="." valign="top">39,380 (74.3)</td><td align="char" char="." valign="top">33,594 (74.7)</td><td align="char" char="." valign="top">5786 (71.9)</td><td align="char" char="." valign="top">50,790 (74.7)</td><td align="char" char="." valign="top">38,887 (76)</td><td align="char" char="." valign="top">11,903 (70.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="char" char="." valign="top">17,672 (24.6)</td><td align="char" char="." valign="top">16,785 (25.8)</td><td align="char" char="." valign="top">887 (13.5)</td><td align="char" char="." valign="top">18,356 (24.1)</td><td align="char" char="." valign="top">17,672 (24.6)</td><td align="char" char="." valign="top">684 (15.2)</td><td align="char" char="." valign="top">13,644 (25.7)</td><td align="char" char="." valign="top">11,378 (25.3)</td><td align="char" char="." valign="top">2266 (28.1)</td><td align="char" char="." valign="top">17,226 (25.3)</td><td align="char" char="." valign="top">12,279 (24)</td><td align="char" char="." 
valign="top">4947 (29.4)</td></tr></tbody></table></table-wrap><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Narrative descriptive statistics, characteristics by outcome.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom" colspan="3">Drive-by</td><td align="left" valign="bottom" colspan="3">Legal intervention</td><td align="left" valign="bottom" colspan="3">Number nonfatally shot</td><td align="left" valign="bottom" colspan="3">Individual injured at home</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Total cases</td><td align="left" valign="top">Negative class cases</td><td align="left" valign="top">Positive class cases</td><td align="left" valign="top">Total cases</td><td align="left" valign="top">Negative class cases</td><td align="left" valign="top">Positive class cases</td><td align="left" valign="top">Total cases</td><td align="left" valign="top">Negative class cases</td><td align="left" valign="top">Positive class cases</td><td align="left" valign="top">Total cases</td><td align="left" valign="top">Negative class cases</td><td align="left" valign="top">Positive class cases</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="13">Panel A: overall</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Circumstances known, n (%)</td><td align="char" char="." valign="top">54,354 (75.8)</td><td align="char" char="." valign="top">47,779 (73.4)</td><td align="char" char="." valign="top">6575 (100)</td><td align="char" char="." valign="top">58,745 (77.1)</td><td align="char" char="." valign="top">54,354 (75.8)</td><td align="char" char="." valign="top">4391 (97.8)</td><td align="char" char="." valign="top">41,794 (78.8)</td><td align="char" char="." valign="top">34,852 (77.5)</td><td align="char" char="." 
valign="top">6942 (86.2)</td><td align="char" char="." valign="top">52,852 (77.7)</td><td align="char" char="." valign="top">38,879 (76)</td><td align="char" char="." valign="top">13,973 (82.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words law enforcement narrative, median (IQR)</td><td align="char" char="." valign="top">80.0 (37.0-137.0)</td><td align="char" char="." valign="top">78.0 (35.0-136.0)</td><td align="char" char="." valign="top">93.0 (53.0-148.0)</td><td align="char" char="." valign="top">81.0 (37.0-141.0)</td><td align="char" char="." valign="top">80.0 (37.0-137.0)</td><td align="char" char="." valign="top">115.0 (44.0-206.0)</td><td align="char" char="." valign="top">83.0 (41.0-141.0)</td><td align="char" char="." valign="top">78.0 (39.0-134.0)</td><td align="char" char="." valign="top">113.0 (65.0-173.0)</td><td align="char" char="." valign="top">83.0 (40.0-141.0)</td><td align="char" char="." valign="top">80.0 (39.0-132.0)</td><td align="char" char="." valign="top">94.0 (45.0-168.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words CME<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> narrative, median (IQR)</td><td align="char" char="." valign="top">89.0 (55.0-138.0)</td><td align="char" char="." valign="top">88.0 (54.0-137.0)</td><td align="char" char="." valign="top">100.0 (64.0-147.0)</td><td align="char" char="." valign="top">90.0 (56.0-141.0)</td><td align="char" char="." valign="top">89.0 (55.0-138.0)</td><td align="char" char="." valign="top">120.0 (78.0-182.0)</td><td align="char" char="." valign="top">88.0 (56.0-137.0)</td><td align="char" char="." valign="top">84.0 (53.0-132.0)</td><td align="char" char="." valign="top">108.0 (72.0-163.0)</td><td align="char" char="." valign="top">91.0 (58.0-140.0)</td><td align="char" char="." 
valign="top">89.0 (56.0-134.0)</td><td align="char" char="." valign="top">100.0 (62.0-161.0)</td></tr><tr><td align="left" valign="top" colspan="13">Panel B: Black</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Circumstances known, n (%)</td><td align="char" char="." valign="top">30,774 (71)</td><td align="char" char="." valign="top">26,393 (67.7)</td><td align="char" char="." valign="top">4381 (100)</td><td align="char" char="." valign="top">31,930 (71.7)</td><td align="char" char="." valign="top">30,774 (71)</td><td align="char" char="." valign="top">1156 (97.2)</td><td align="char" char="." valign="top">23,041 (73.6)</td><td align="char" char="." valign="top">18,562 (71.6)</td><td align="char" char="." valign="top">4479 (83.4)</td><td align="char" char="." valign="top">29,875 (72.7)</td><td align="char" char="." valign="top">24,054 (71.5)</td><td align="char" char="." valign="top">5821 (77.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words law enforcement narrative, median (IQR)</td><td align="char" char="." valign="top">77.0 (38.0-125.0)</td><td align="char" char="." valign="top">74.0 (36.0-121.0)</td><td align="char" char="." valign="top">98.0 (59.0-152.0)</td><td align="char" char="." valign="top">77.0 (38.0-126.0)</td><td align="char" char="." valign="top">77.0 (38.0-125.0)</td><td align="char" char="." valign="top">98.0 (41.0-169.0)</td><td align="char" char="." valign="top">80.0 (42.0-126.0)</td><td align="char" char="." valign="top">74.0 (39.0-118.0)</td><td align="char" char="." valign="top">109.0 (66.5-160.0)</td><td align="char" char="." valign="top">80.0 (41.0-127.0)</td><td align="char" char="." valign="top">79.0 (41.0-125.0)</td><td align="char" char="." 
valign="top">82.0 (43.0-137.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words CME narrative, median (IQR)</td><td align="char" char="." valign="top">85.0 (55.0-126.0)</td><td align="char" char="." valign="top">83.0 (53.0-123.0)</td><td align="char" char="." valign="top">105.0 (72.0-151.0)</td><td align="char" char="." valign="top">86.0 (55.0-127.0)</td><td align="char" char="." valign="top">85.0 (55.0-126.0)</td><td align="char" char="." valign="top">105.0 (73.0-150.0)</td><td align="char" char="." valign="top">84.0 (56.0-125.0)</td><td align="char" char="." valign="top">80.0 (53.0-119.0)</td><td align="char" char="." valign="top">106.0 (72.0-155.0)</td><td align="char" char="." valign="top">87.0 (57.0-128.0)</td><td align="char" char="." valign="top">87.0 (57.0-126.0)</td><td align="char" char="." valign="top">89.0 (58.0-137.0)</td></tr><tr><td align="left" valign="top" colspan="13">Panel C: Hispanic</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Circumstances known, n (%)</td><td align="char" char="." valign="top">8698 (83.7)</td><td align="char" char="." valign="top">7055 (80.7)</td><td align="char" char="." valign="top">1643 (100)</td><td align="char" char="." valign="top">9487 (84.8)</td><td align="char" char="." valign="top">8698 (83.7)</td><td align="char" char="." valign="top">789 (99.4)</td><td align="char" char="." valign="top">7107 (87.6)</td><td align="char" char="." valign="top">5980 (86.7)</td><td align="char" char="." valign="top">1127 (92.5)</td><td align="char" char="." valign="top">8521 (85.4)</td><td align="char" char="." valign="top">6906 (85.3)</td><td align="char" char="." 
valign="top">1615 (85.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words law enforcement narrative, median (IQR)</td><td align="char" char="." valign="top">67.0 (29.0-136.0)</td><td align="char" char="." valign="top">66.0 (24.0-139.0)</td><td align="char" char="." valign="top">72.0 (40.0-129.0)</td><td align="char" char="." valign="top">69.0 (28.0-144.0)</td><td align="char" char="." valign="top">67.0 (29.0-136.0)</td><td align="char" char="." valign="top">132.0 (13.0-248.0)</td><td align="char" char="." valign="top">69.0 (34.0-142.0)</td><td align="char" char="." valign="top">65.0 (32.0-132.0)</td><td align="char" char="." valign="top">106.0 (52.0-186.0)</td><td align="char" char="." valign="top">69.0 (31.0-139.0)</td><td align="char" char="." valign="top">66.0 (31.0-130.0)</td><td align="char" char="." valign="top">89.0 (32.0-186.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words CME narrative, median (IQR)</td><td align="char" char="." valign="top">87.0 (45.0-146.0)</td><td align="char" char="." valign="top">88.0 (48.0-149.0)</td><td align="char" char="." valign="top">79.0 (31.0-134.0)</td><td align="char" char="." valign="top">90.5 (47.0-151.0)</td><td align="char" char="." valign="top">87.0 (45.0-146.0)</td><td align="char" char="." valign="top">139.0 (90.0-206.0)</td><td align="char" char="." valign="top">79.0 (41.0-141.0)</td><td align="char" char="." valign="top">75.0 (39.0-134.0)</td><td align="char" char="." valign="top">106.0 (60.0-170.0)</td><td align="char" char="." valign="top">88.0 (47.0-147.0)</td><td align="char" char="." valign="top">83.0 (44.0-141.0)</td><td align="char" char="." 
valign="top">111.0 (66.0-180.0)</td></tr><tr><td align="left" valign="top" colspan="13">Panel D: White</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Circumstances known, n (%)</td><td align="char" char="." valign="top">12,860 (83.2)</td><td align="char" char="." valign="top">12,452 (82.7)</td><td align="char" char="." valign="top">408 (100)</td><td align="char" char="." valign="top">15,003 (85)</td><td align="char" char="." valign="top">12,860 (83.2)</td><td align="char" char="." valign="top">2143 (97.5)</td><td align="char" char="." valign="top">10,052 (86)</td><td align="char" char="." valign="top">8952 (85.5)</td><td align="char" char="." valign="top">1100 (90.8)</td><td align="char" char="." valign="top">12,496 (85.7)</td><td align="char" char="." valign="top">6537 (84.1)</td><td align="char" char="." valign="top">5959 (87.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words law enforcement narrative, median (IQR)</td><td align="char" char="." valign="top">99.0 (42.0-182.0)</td><td align="char" char="." valign="top">99.0 (41.0-181.0)</td><td align="char" char="." valign="top">112.0 (64.5-186.5)</td><td align="char" char="." valign="top">102.0 (43.0-186.0)</td><td align="char" char="." valign="top">99.0 (42.0-182.0)</td><td align="char" char="." valign="top">119.0 (53.0-212.0)</td><td align="char" char="." valign="top">105.0 (48.0-186.0)</td><td align="char" char="." valign="top">101.0 (47.0-181.0)</td><td align="char" char="." valign="top">134.0 (76.0-232.0)</td><td align="char" char="." valign="top">104.0 (47.0-187.0)</td><td align="char" char="." valign="top">97.0 (43.0-172.0)</td><td align="char" char="." 
valign="top">114.0 (52.0-205.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words CME narrative, median (IQR)</td><td align="char" char="." valign="top">103.0 (61.0-165.0)</td><td align="char" char="." valign="top">102.0 (61.0-165.0)</td><td align="char" char="." valign="top">107.0 (71.5-149.5)</td><td align="char" char="." valign="top">105.0 (63.0-167.0)</td><td align="char" char="." valign="top">103.0 (61.0-165.0)</td><td align="char" char="." valign="top">122.0 (77.0-184.0)</td><td align="char" char="." valign="top">104.0 (64.0-166.0)</td><td align="char" char="." valign="top">103.0 (63.0-163.0)</td><td align="char" char="." valign="top">118.0 (74.0-191.0)</td><td align="char" char="." valign="top">106.0 (65.0-168.0)</td><td align="char" char="." valign="top">101.0 (63.0-156.0)</td><td align="char" char="." valign="top">112.0 (68.0-183.0)</td></tr><tr><td align="left" valign="top" colspan="13">Panel E: female</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Circumstances known, n (%)</td><td align="char" char="." valign="top">9404 (83.6)</td><td align="char" char="." valign="top">8736 (82.5)</td><td align="char" char="." valign="top">668 (100)</td><td align="char" char="." valign="top">9567 (83.7)</td><td align="char" char="." valign="top">9404 (83.6)</td><td align="char" char="." valign="top">163 (95.9)</td><td align="char" char="." valign="top">7242 (86.4)</td><td align="char" char="." valign="top">6065 (86.1)</td><td align="char" char="." valign="top">1177 (88.2)</td><td align="char" char="." valign="top">9169 (85.3)</td><td align="char" char="." valign="top">4271 (81.9)</td><td align="char" char="." 
valign="top">4898 (88.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words law enforcement narrative, median (IQR)</td><td align="char" char="." valign="top">104.0 (48.0-184.0)</td><td align="char" char="." valign="top">104.0 (46.0-186.0)</td><td align="char" char="." valign="top">109.0 (60.0-158.5)</td><td align="char" char="." valign="top">104.0 (48.0-184.0)</td><td align="char" char="." valign="top">104.0 (48.0-184.0)</td><td align="char" char="." valign="top">121.5 (48.0-193.0)</td><td align="char" char="." valign="top">109.0 (54.0-189.0)</td><td align="char" char="." valign="top">106.0 (51.0-186.0)</td><td align="char" char="." valign="top">125.0 (69.0-204.0)</td><td align="char" char="." valign="top">107.0 (51.5-187.0)</td><td align="char" char="." valign="top">97.0 (47.0-165.0)</td><td align="char" char="." valign="top">118.0 (57.0-207.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words CME narrative, median (IQR)</td><td align="char" char="." valign="top">111.0 (68.0-181.0)</td><td align="char" char="." valign="top">112.0 (68.0-183.0)</td><td align="char" char="." valign="top">105.0 (67.0-152.5)</td><td align="char" char="." valign="top">111.0 (68.0-181.0)</td><td align="char" char="." valign="top">111.0 (68.0-181.0)</td><td align="char" char="." valign="top">134.0 (90.0-198.0)</td><td align="char" char="." valign="top">112.5 (71.0-181.0)</td><td align="char" char="." valign="top">111.0 (69.0-181.0)</td><td align="char" char="." valign="top">122.0 (80.0-188.0)</td><td align="char" char="." valign="top">113.0 (71.0-184.0)</td><td align="char" char="." valign="top">105.0 (66.0-165.0)</td><td align="char" char="." 
valign="top">124.0 (75.0-198.0)</td></tr><tr><td align="left" valign="top" colspan="13">Panel F: male</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Circumstances known, n (%)</td><td align="char" char="." valign="top">44,950 (74.4)</td><td align="char" char="." valign="top">39,043 (71.6)</td><td align="char" char="." valign="top">5907 (100)</td><td align="char" char="." valign="top">49,178 (75.9)</td><td align="char" char="." valign="top">44,950 (74.4)</td><td align="char" char="." valign="top">4228 (97.9)</td><td align="char" char="." valign="top">34,552 (77.4)</td><td align="char" char="." valign="top">28,787 (75.9)</td><td align="char" char="." valign="top">5765 (85.8)</td><td align="char" char="." valign="top">43,683 (76.3)</td><td align="char" char="." valign="top">34,608 (75.3)</td><td align="char" char="." valign="top">9075 (80.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words law enforcement narrative, median (IQR)</td><td align="char" char="." valign="top">76.0 (35.0-130.0)</td><td align="char" char="." valign="top">74.0 (34.0-128.0)</td><td align="char" char="." valign="top">92.0 (52.0-147.0)</td><td align="char" char="." valign="top">78.0 (36.0-134.0)</td><td align="char" char="." valign="top">76.0 (35.0-130.0)</td><td align="char" char="." valign="top">114.0 (44.0-207.0)</td><td align="char" char="." valign="top">79.0 (40.0-133.0)</td><td align="char" char="." valign="top">74.0 (37.0-125.0)</td><td align="char" char="." valign="top">110.0 (64.0-168.0)</td><td align="char" char="." valign="top">79.0 (39.0-133.0)</td><td align="char" char="." valign="top">78.0 (39.0-129.0)</td><td align="char" char="." 
valign="top">85.0 (41.0-150.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Words CME narrative, median (IQR)</td><td align="char" char="." valign="top">86.0 (53.0-131.0)</td><td align="char" char="." valign="top">84.0 (53.0-129.0)</td><td align="char" char="." valign="top">100.0 (64.0-146.0)</td><td align="char" char="." valign="top">87.0 (55.0-135.0)</td><td align="char" char="." valign="top">86.0 (53.0-131.0)</td><td align="char" char="." valign="top">120.0 (78.0-181.0)</td><td align="char" char="." valign="top">84.0 (54.0-129.0)</td><td align="char" char="." valign="top">81.0 (52.0-124.0)</td><td align="char" char="." valign="top">106.0 (70.0-158.0)</td><td align="char" char="." valign="top">88.0 (56.0-133.0)</td><td align="char" char="." valign="top">87.0 (55.0-131.0)</td><td align="char" char="." valign="top">92.0 (58.0-145.0)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>CME: coroner and medical examiner.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table3">Table 3</xref> displays classification performance by <italic>F</italic><sub>1</sub>-score for each model type. Models trained on approximately 1500 cases achieved an <italic>F</italic><sub>1</sub>-score of at least 0.6 for each outcome, though at 1000 cases, the majority of outcomes were already at or above 0.6. The exception was the number nonfatally shot. <xref ref-type="fig" rid="figure1">Figure 1</xref> plots the learning curves for the <italic>F</italic><sub>1</sub>-scores reported in <xref ref-type="table" rid="table3">Table 3</xref>. Language replacement models tended to perform best (<xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="fig" rid="figure1">Figure 1</xref>), with the highest <italic>F</italic><sub>1</sub>-score in all but 6 model interactions. 
In particular, language replacement models consistently obtained the highest <italic>F</italic><sub>1</sub>-score for legal intervention homicides (<xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="fig" rid="figure1">Figure 1</xref>). Similarly, language replacement models featured higher precision scores for a subset of outcomes (<xref ref-type="fig" rid="figure2">Figure 2</xref> and Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Differences between models were less substantial for recall (<xref ref-type="fig" rid="figure3">Figure 3</xref> and Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and the false negative rate (<xref ref-type="fig" rid="figure4">Figure 4</xref>). Models that omitted coroner or medical examiner reports performed worse across outcomes (<xref ref-type="fig" rid="figure1">Figures 1</xref><xref ref-type="fig" rid="figure2"/><xref ref-type="fig" rid="figure3"/>-<xref ref-type="fig" rid="figure4">4</xref>). 
Language replacement models trained on 1500&#x2010;2000 narratives obtained low false negative rates, with 1% to 5% of true cases misclassified (<xref ref-type="fig" rid="figure1">Figure 1</xref> and Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p><italic>F</italic><sub>1</sub>-scores by model outcome, training data, and model type.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Outcome</td><td align="left" valign="bottom">Train, n</td><td align="left" valign="bottom">DistilBERT<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>, <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">DistilBERT+LE<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> only<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>, <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">DistilBERT+language<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup>, <italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">100</td><td align="left" valign="top"><italic>0.219<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></italic></td><td align="left" valign="top">0.168</td><td align="left" valign="top">0.209</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">200</td><td align="left" valign="top"><italic>0.232</italic></td><td align="left" valign="top">0.148</td><td align="left" valign="top">0.232</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">500</td><td align="left" valign="top">0.381</td><td align="left" valign="top">0.144</td><td align="left" valign="top"><italic>0.473</italic></td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" 
valign="top">1000</td><td align="left" valign="top"><italic>0.626</italic></td><td align="left" valign="top">0.124</td><td align="left" valign="top">0.606</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.619</td><td align="left" valign="top">0.126</td><td align="left" valign="top"><italic>0.623</italic></td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">2000</td><td align="left" valign="top">0.593</td><td align="left" valign="top">0.126</td><td align="left" valign="top"><italic>0.635</italic></td></tr><tr><td align="left" valign="top">Police shooting</td><td align="left" valign="top">100</td><td align="left" valign="top">0.231</td><td align="left" valign="top">0.105</td><td align="left" valign="top"><italic>0.305</italic></td></tr><tr><td align="left" valign="top">Police shooting</td><td align="left" valign="top">200</td><td align="left" valign="top">0.218</td><td align="left" valign="top">0.083</td><td align="left" valign="top"><italic>0.364</italic></td></tr><tr><td align="left" valign="top">Police shooting</td><td align="left" valign="top">500</td><td align="left" valign="top">0.490</td><td align="left" valign="top">0.083</td><td align="left" valign="top"><italic>0.653</italic></td></tr><tr><td align="left" valign="top">Police shooting</td><td align="left" valign="top">1000</td><td align="left" valign="top">0.739</td><td align="left" valign="top">0.064</td><td align="left" valign="top"><italic>0.795</italic></td></tr><tr><td align="left" valign="top">Police shooting</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.771</td><td align="left" valign="top">0.056</td><td align="left" valign="top"><italic>0.856</italic></td></tr><tr><td align="left" valign="top">Police shooting</td><td align="left" valign="top">2000</td><td align="left" valign="top">0.770</td><td align="left" valign="top">0.080</td><td align="left" 
valign="top"><italic>0.833</italic></td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">100</td><td align="left" valign="top"><italic>0.319</italic></td><td align="left" valign="top">0.246</td><td align="left" valign="top">0.312</td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">200</td><td align="left" valign="top">0.281</td><td align="left" valign="top">0.226</td><td align="left" valign="top"><italic>0.286</italic></td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">500</td><td align="left" valign="top">0.341</td><td align="left" valign="top">0.192</td><td align="left" valign="top"><italic>0.345</italic></td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">1000</td><td align="left" valign="top">0.352</td><td align="left" valign="top">0.220</td><td align="left" valign="top"><italic>0.413</italic></td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.591</td><td align="left" valign="top">0.182</td><td align="left" valign="top"><italic>0.642</italic></td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">2000</td><td align="left" valign="top">0.608</td><td align="left" valign="top">0.195</td><td align="left" valign="top"><italic>0.663</italic></td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">100</td><td align="left" valign="top">0.547</td><td align="left" valign="top">0.222</td><td align="left" valign="top"><italic>0.574</italic></td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">200</td><td align="left" valign="top">0.578</td><td align="left" valign="top">0.283</td><td align="left" valign="top"><italic>0.629</italic></td></tr><tr><td 
align="left" valign="top">Individual injured at home</td><td align="left" valign="top">500</td><td align="left" valign="top">0.665</td><td align="left" valign="top">0.277</td><td align="left" valign="top"><italic>0.714</italic></td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">1000</td><td align="left" valign="top"><italic>0.722</italic></td><td align="left" valign="top">0.294</td><td align="left" valign="top">0.697</td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.737</td><td align="left" valign="top">0.280</td><td align="left" valign="top"><italic>0.749</italic></td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">2000</td><td align="left" valign="top"><italic>0.744</italic></td><td align="left" valign="top">0.286</td><td align="left" valign="top">0.739</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>The base distilBERT model.</p></fn><fn id="table3fn2"><p><sup>b</sup>LE: law enforcement.</p></fn><fn id="table3fn3"><p><sup>c</sup>The distilBERT model trained only on LE narratives.</p></fn><fn id="table3fn4"><p><sup>d</sup>The distilBERT model in which uncommon language in the narratives is replaced for clarity.</p></fn><fn id="table3fn5"><p><sup>e</sup>Values in italics format correspond to the best <italic>F</italic><sub>1</sub>-score across the listed models.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Learning curve by outcome, model type&#x2014;<italic>F</italic><sub>1</sub>-score. <italic>F</italic><sub>1</sub>-scores are plotted for distilBERT models, distilBERT models with language replacement, and models that do not use LE narratives. 
Training data randomly sampled and corresponding to amounts of 100, 200, 500, 1000, 1500, and 2000 randomly sampled training datasets are plotted according to each model performance metric. Test sets reporting results are identical across models for each outcome variable. LE: law enforcement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68212_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Learning curve by outcome, model type&#x2014;precision. Precision scores are plotted for distilBERT models, distilBERT models with language replacement, and models that do not use LE narratives. Training data randomly sampled and corresponding to amounts of 100, 200, 500, 1000, 1500, and 2000 randomly sampled training datasets are plotted according to each model performance metric. Test sets reporting results are identical across models for each outcome variable. LE: law enforcement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68212_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Learning curve by outcome, model type&#x2014;recall. Recall scores are plotted for distilBERT models, distilBERT models with language replacement, and models that do not use LE narratives. Training data randomly sampled and corresponding to amounts of 100, 200, 500, 1000, 1500, and 2000 randomly sampled training datasets are plotted according to each model performance metric. Test sets reporting results are identical across models for each outcome variable. LE: law enforcement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68212_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Learning curve by outcome, model type&#x2014;false negative rate. 
False negative scores are plotted for distilBERT models, distilBERT models with language replacement, and models that do not use LE narratives. Training data randomly sampled and corresponding to amounts of 100, 200, 500, 1000, 1500, and 2000 randomly sampled training datasets are plotted according to each model performance metric. Test sets reporting results are identical across models for each outcome variable. LE: law enforcement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68212_fig04.png"/></fig><p>Oversampling positive class cases was negligibly helpful in improving <italic>F</italic><sub>1</sub>-scores (<xref ref-type="fig" rid="figure5">Figure 5</xref>). For instance, oversampling for legal intervention homicide to be composed of 20% positive class cases resulted in the addition of 580 positive class cases added to training data and an <italic>F</italic><sub>1</sub>-score of 0.795 (Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and <xref ref-type="fig" rid="figure5">Figure 5</xref>). Relative to adding 500 randomly sampled cases, which would result in an <italic>F</italic><sub>1</sub>-score of 0.771 (Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), the <italic>F</italic><sub>1</sub>-score gain from oversampling was 0.024 (0.795-0.771) and therefore modest.</p><p><xref ref-type="fig" rid="figure6">Figure 6</xref> plots <italic>F</italic><sub>1</sub>-scores of distilBERT language replacement models, as these models tended to perform best overall and may capture linguistic differences most accurately across subgroups. Predictions differ by race or ethnicity and sex across models. 
Legal intervention homicide victims who were White or Hispanic were most often correctly classified as such, and Black decedents were least likely to be correctly classified (<xref ref-type="fig" rid="figure6">Figure 6</xref> and <xref ref-type="table" rid="table4">Table 4</xref>). The prediction difference was substantial for legal intervention victims with lower amounts of training data, though the gap persisted with higher volumes of training data. White decedents shot at home were most often correctly predicted, while Black and Hispanic decedents were least likely to be correctly predicted. Female decedents were less often correctly predicted than male decedents except if they were shot at home (<xref ref-type="fig" rid="figure7">Figure 7</xref>). Among models with at least 1500 records of training data, <italic>F</italic><sub>1</sub>-score disparities ranged from 0.2 to 0.25 by race and ethnicity and from 0.12 to 0.2 between male and female decedents (<xref ref-type="table" rid="table4">Table 4</xref>).</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p><italic>F</italic><sub>1</sub>-learning curve for oversampled positive class cases versus baseline language replacement model. <italic>F</italic><sub>1</sub>-scores are plotted for distilBERT models fit with language replacement for both randomly sampled training data and oversampled training data. Oversampled training data correspond to an increment of a 10% increase in the proportion of positive class cases included in training data. The exact training dataset counts are in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
Random train data is plotted at 1000, 1500, and 2000 randomly sampled training data records for reference.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68212_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p><italic>F</italic><sub>1</sub>-learning curves for distilBERT+language models by race and ethnicity. <italic>F</italic><sub>1</sub>-scores are plotted for distilBERT models with language replacement for each outcome by race or ethnicity. Training data randomly sampled and corresponding to amounts of 100, 200, 500, 1000, 1500, and 2000. Test sets reporting results are identical across models for each outcome variable.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68212_fig06.png"/></fig><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Classification performance for language replacement models by outcome by subgroup<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Train<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>, n</td><td align="left" valign="bottom">White, <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Black, <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Hispanic, <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Male, <italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Female, <italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">100</td><td align="left" valign="top">0.208</td><td align="left" valign="top">0.353</td><td align="left" valign="top">0.322</td><td align="left" valign="top">0.317</td><td 
align="left" valign="top">0.292</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">200</td><td align="left" valign="top">0.204</td><td align="left" valign="top">0.321</td><td align="left" valign="top">0.280</td><td align="left" valign="top">0.292</td><td align="left" valign="top">0.266</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">500</td><td align="left" valign="top">0.237</td><td align="left" valign="top">0.390</td><td align="left" valign="top">0.346</td><td align="left" valign="top">0.356</td><td align="left" valign="top">0.302</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">1000</td><td align="left" valign="top">0.321</td><td align="left" valign="top">0.447</td><td align="left" valign="top">0.380</td><td align="left" valign="top">0.416</td><td align="left" valign="top">0.401</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.536</td><td align="left" valign="top">0.675</td><td align="left" valign="top">0.607</td><td align="left" valign="top">0.651</td><td align="left" valign="top">0.600</td></tr><tr><td align="left" valign="top">Drive-by</td><td align="left" valign="top">2000</td><td align="left" valign="top">0.554</td><td align="left" valign="top">0.704</td><td align="left" valign="top">0.608</td><td align="left" valign="top">0.672</td><td align="left" valign="top">0.622</td></tr><tr><td align="left" valign="top">Legal intervention</td><td align="left" valign="top">100</td><td align="left" valign="top">0.383</td><td align="left" valign="top">0.169</td><td align="left" valign="top">0.324</td><td align="left" valign="top">0.336</td><td align="left" valign="top">0.085</td></tr><tr><td align="left" valign="top">Legal intervention</td><td align="left" valign="top">200</td><td align="left" valign="top">0.475</td><td align="left" valign="top">0.162</td><td align="left" 
valign="top">0.411</td><td align="left" valign="top">0.391</td><td align="left" valign="top">0.138</td></tr><tr><td align="left" valign="top">Legal intervention</td><td align="left" valign="top">500</td><td align="left" valign="top">0.746</td><td align="left" valign="top">0.473</td><td align="left" valign="top">0.722</td><td align="left" valign="top">0.678</td><td align="left" valign="top">0.343</td></tr><tr><td align="left" valign="top">Legal intervention</td><td align="left" valign="top">1000</td><td align="left" valign="top">0.812</td><td align="left" valign="top">0.723</td><td align="left" valign="top">0.816</td><td align="left" valign="top">0.817</td><td align="left" valign="top">0.495</td></tr><tr><td align="left" valign="top">Legal intervention</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.852</td><td align="left" valign="top">0.830</td><td align="left" valign="top">0.893</td><td align="left" valign="top">0.870</td><td align="left" valign="top">0.632</td></tr><tr><td align="left" valign="top">Legal intervention</td><td align="left" valign="top">2000</td><td align="left" valign="top">0.833</td><td align="left" valign="top">0.793</td><td align="left" valign="top">0.873</td><td align="left" valign="top">0.842</td><td align="left" valign="top">0.672</td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">100</td><td align="left" valign="top">0.075</td><td align="left" valign="top">0.248</td><td align="left" valign="top">0.238</td><td align="left" valign="top">0.226</td><td align="left" valign="top">0.127</td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">200</td><td align="left" valign="top">0.073</td><td align="left" valign="top">0.290</td><td align="left" valign="top">0.268</td><td align="left" valign="top">0.255</td><td align="left" valign="top">0.131</td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" 
valign="top">500</td><td align="left" valign="top">0.239</td><td align="left" valign="top">0.478</td><td align="left" valign="top">0.604</td><td align="left" valign="top">0.479</td><td align="left" valign="top">0.431</td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">1000</td><td align="left" valign="top">0.382</td><td align="left" valign="top">0.613</td><td align="left" valign="top">0.693</td><td align="left" valign="top">0.613</td><td align="left" valign="top">0.558</td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.412</td><td align="left" valign="top">0.629</td><td align="left" valign="top">0.688</td><td align="left" valign="top">0.626</td><td align="left" valign="top">0.602</td></tr><tr><td align="left" valign="top">Number nonfatally shot</td><td align="left" valign="top">2000</td><td align="left" valign="top">0.425</td><td align="left" valign="top">0.640</td><td align="left" valign="top">0.712</td><td align="left" valign="top">0.639</td><td align="left" valign="top">0.610</td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">100</td><td align="left" valign="top">0.679</td><td align="left" valign="top">0.488</td><td align="left" valign="top">0.543</td><td align="left" valign="top">0.505</td><td align="left" valign="top">0.724</td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">200</td><td align="left" valign="top">0.733</td><td align="left" valign="top">0.552</td><td align="left" valign="top">0.605</td><td align="left" valign="top">0.566</td><td align="left" valign="top">0.783</td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">500</td><td align="left" valign="top">0.785</td><td align="left" valign="top">0.660</td><td align="left" valign="top">0.680</td><td align="left" 
valign="top">0.665</td><td align="left" valign="top">0.824</td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">1000</td><td align="left" valign="top">0.784</td><td align="left" valign="top">0.637</td><td align="left" valign="top">0.649</td><td align="left" valign="top">0.642</td><td align="left" valign="top">0.826</td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">1500</td><td align="left" valign="top">0.821</td><td align="left" valign="top">0.709</td><td align="left" valign="top">0.663</td><td align="left" valign="top">0.703</td><td align="left" valign="top">0.851</td></tr><tr><td align="left" valign="top">Individual injured at home</td><td align="left" valign="top">2000</td><td align="left" valign="top">0.805</td><td align="left" valign="top">0.696</td><td align="left" valign="top">0.683</td><td align="left" valign="top">0.699</td><td align="left" valign="top">0.828</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup><italic>F</italic><sub>1</sub>-scores are listed for distilBERT models with language replacement across target outcomes within subgroups including race, ethnicity, and sex. Test sets reporting results are identical across models for each outcome variable.</p></fn><fn id="table4fn2"><p><sup>b</sup>Training data randomly sampled and corresponding to amounts of 100, 200, 500, 1000, 1500, and 2000.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure7"><label>Figure 7.</label><caption><p><italic>F</italic><sub>1</sub>-learning curves for distilBERT+language models by sex. <italic>F</italic><sub>1</sub>-scores are plotted for distilBERT models with language replacement for each outcome by sex. Training data randomly sampled and corresponding to amounts of 100, 200, 500, 1000, 1500, and 2000. 
Test sets reporting results are identical across models for each outcome variable.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68212_fig07.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This analysis simulated the NLP model-fitting process to demonstrate how different training and preprocessing decisions impact model performance in the supervised classification of violent death narratives. Results show that the compact LLM approach is useful for predicting rare NVDRS outcomes relative to naive prediction baselines. The best model for drive-by shootings achieved an <italic>F</italic><sub>1</sub>-score of 0.635 (<xref ref-type="table" rid="table3">Table 3</xref>) for an outcome where the proportion of positive class cases was 9.2%. For context, if the model had correctly classified only the 9.2% of positive class cases, it would have achieved an <italic>F</italic><sub>1</sub>-score of 0.162. While variation exists across outcomes in the rate of improvement over a naive prediction, the improvement in <italic>F</italic><sub>1</sub>-scores across infrequent NVDRS events demonstrates that a compact LLM approach is useful.</p><p>Simulations suggest that fine-tuning compact LLMs on NVDRS text requires approximately 1000&#x2010;1500 training data records to achieve an <italic>F</italic><sub>1</sub>-score of at least 0.6. However, substantial variation existed between outcomes. For drive-by shootings and for whether a victim is injured at home, the learning curves flatten at 1000 cases and do not make further <italic>F</italic><sub>1</sub>-score gains with additional training data. Legal intervention (police shootings) continues to make additional <italic>F</italic><sub>1</sub>-score gains beyond 1000 cases and achieves an <italic>F</italic><sub>1</sub>-score of 0.75 at 2000 cases. 
Similarly, for the number of victims nonfatally shot, the addition of training data beyond 1000 cases substantially improves the <italic>F</italic><sub>1</sub>-score to 0.66 at 2000 cases.</p><p>In addition, preprocessing data to reduce domain-specific jargon resulted in improved model performance. Oversampling the positive class cases in training data does not increase prediction accuracy substantially over randomly sampled training data. Predictions differed by race, ethnicity, and sex.</p><p>Results suggested that compact LLMs are useful but require training data to correctly classify outcomes of interest. Random sampling and labeling a sufficient number of cases (approximately 1000) combined with a weighting layer is an effective classification strategy. Relative to recent few-shot and zero-shot learning applications using similar data sources [<xref ref-type="bibr" rid="ref29">29</xref>], simulation findings differ, in that the volume of training data required is more substantial. The additional training data may be a function of a class imbalance in the target outcome, as other applications use more prevalent outcomes.</p><p>Differential prediction by subgroup is not explainable by outcome frequency or narrative length alone. For instance, White decedents of police shootings are less prevalent than Black decedents in the sample but are more often classified correctly. Similarly, female decedents have longer median narratives for all outcomes but are less likely to be correctly classified. This finding expands upon the current literature, which has found systematic data missingness in NVDRS [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref40">40</xref>]. 
Further research should characterize sources of differential prediction, whether they originate in the input narratives or are exacerbated by the NLP classifier, and examine fairness-aware models, particularly if the prediction is used for decision-making or resource allocation in public health settings.</p></sec><sec id="s4-2"><title>Limitations</title><p>This research is subject to several limitations. First, results from a compact LLM may not fully generalize to new LLMs with additional sophistication or to different language contexts beyond NVDRS. Second, label noise from NVDRS annotators may mean that results understate the performance of compact LLMs, which is consistent with police shootings tending to be the outcome type that is most accurately predicted. Third, the potential for differential prediction by subgroup raises concerns about fairness and equity in model performance. Further investigations into the sources of this differential prediction are needed to ensure that NLP applications do not exacerbate existing disparities.</p></sec><sec id="s4-3"><title>Conclusions</title><p>Compact LLMs with simple text changes can effectively predict rare NVDRS outcomes. For researchers using supervised machine learning to expand knowledge of violent deaths beyond existing coded fields, applying compact LLMs to sufficient training data can be a valuable approach. While future advancements will likely improve access to privacy-compliant, more sophisticated LLMs for analyzing sensitive data, this study provides a useful baseline for researchers pursuing similar efforts in the interim while underlining the potential for differential prediction by subgroup.</p></sec></sec></body><back><ack><p>The authors thank Matthew Miller and Deb Azrael for comments on a prior draft of this paper. The authors also thank Daniel Bowen and Stephen Sumner for valuable discussion in the development of this paper. Generative artificial intelligence was not used in the course of writing this manuscript. 
This work was funded by APHA AWARD # 2023-0011.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available due to data restrictions. They are available from the Centers for Disease Control and Prevention National Violent Death Reporting System&#x2019;s Restricted Access Data. These data are available after applying for Restricted Access Data permissions.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1"><italic>ICD</italic></term><def><p><italic>International Classification of Diseases</italic></p></def></def-item><def-item><term id="abb2">LE</term><def><p>law enforcement</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb5">NVDRS</term><def><p>National Violent Death Reporting System</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>WISQARS leading causes of death visualization tool</article-title><source>Centers for Disease Control and Prevention</source><access-date>2025-05-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://wisqars.cdc.gov/lcd">https://wisqars.cdc.gov/lcd</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>National Violent Death Reporting System (NVDRS)</article-title><source>Centers for Disease Control and Prevention</source><year>2024</year><access-date>2025-05-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/nvdrs/about/index.html">https://www.cdc.gov/nvdrs/about/index.html</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chatfield</surname><given-names>SL</given-names> </name><name name-style="western"><surname>DeBois</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Evans</surname><given-names>SD</given-names> </name></person-group><article-title>Mixed methods secondary analysis of older adult homicide-suicides from National Violent Death Reporting System (NVDRS) Data</article-title><source>Am J Qual Res</source><year>2022</year><volume>6</volume><issue>2</issue><fpage>115</fpage><lpage>132</lpage><pub-id pub-id-type="doi">10.29333/ajqr/12129</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fowler</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Leavitt</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Betz</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Dahlberg</surname><given-names>LL</given-names> </name></person-group><article-title>Examining differences between mass, multiple, and single-victim homicides to inform prevention: findings from the National Violent Death Reporting System</article-title><source>Inj Epidemiol</source><year>2021</year><month>08</month><day>9</day><volume>8</volume><issue>1</issue><fpage>49</fpage><pub-id pub-id-type="doi">10.1186/s40621-021-00345-7</pub-id><pub-id pub-id-type="medline">34365969</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rogers</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>J</given-names> 
</name></person-group><article-title>The research utility of the National Violent Death Reporting System for understanding homicide trends</article-title><source>J Contemp Crim Justice</source><year>2024</year><month>02</month><volume>40</volume><issue>1</issue><fpage>26</fpage><lpage>47</lpage><pub-id pub-id-type="doi">10.1177/10439862231189985</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adhia</surname><given-names>A</given-names> </name><name name-style="western"><surname>Austin</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Fitzmaurice</surname><given-names>GM</given-names> </name><name name-style="western"><surname>Hemenway</surname><given-names>D</given-names> </name></person-group><article-title>The role of intimate partner violence in homicides of children aged 2-14 years</article-title><source>Am J Prev Med</source><year>2019</year><month>01</month><volume>56</volume><issue>1</issue><fpage>38</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1016/j.amepre.2018.08.028</pub-id><pub-id pub-id-type="medline">30416031</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anglemyer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Horvath</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rutherford</surname><given-names>G</given-names> </name></person-group><article-title>The accessibility of firearms and risk for suicide and homicide victimization among household members: a systematic review and meta-analysis</article-title><source>Ann Intern Med</source><year>2014</year><month>01</month><day>21</day><volume>160</volume><issue>2</issue><fpage>101</fpage><lpage>110</lpage><pub-id 
pub-id-type="doi">10.7326/M13-1301</pub-id><pub-id pub-id-type="medline">24592495</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azrael</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mukamal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Gunnell</surname><given-names>D</given-names> </name><name name-style="western"><surname>Barber</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>M</given-names> </name></person-group><article-title>Identifying and tracking gas suicides in the U.S. using the National Violent Death Reporting System, 2005-2012</article-title><source>Am J Prev Med</source><year>2016</year><month>11</month><volume>51</volume><fpage>S219</fpage><lpage>S225</lpage><pub-id pub-id-type="doi">10.1016/j.amepre.2016.08.006</pub-id><pub-id pub-id-type="medline">27745610</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barber</surname><given-names>C</given-names> </name><name name-style="western"><surname>Azrael</surname><given-names>D</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hemenway</surname><given-names>D</given-names> </name></person-group><article-title>Who owned the gun in firearm suicides of men, women, and youth in five US states?</article-title><source>Prev Med</source><year>2022</year><month>11</month><volume>164</volume><fpage>107066</fpage><pub-id pub-id-type="doi">10.1016/j.ypmed.2022.107066</pub-id><pub-id pub-id-type="medline">35461957</pub-id></nlm-citation></ref><ref 
id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barber</surname><given-names>C</given-names> </name><name name-style="western"><surname>Walters</surname><given-names>H</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hemenway</surname><given-names>D</given-names> </name></person-group><article-title>Suicides at shooting ranges</article-title><source>Crisis</source><year>2021</year><month>01</month><volume>42</volume><issue>1</issue><fpage>13</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1027/0227-5910/a000676</pub-id><pub-id pub-id-type="medline">32343169</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Conner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Azrael</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lyons</surname><given-names>VH</given-names> </name><name name-style="western"><surname>Barber</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>M</given-names> </name></person-group><article-title>Validating the National Violent Death Reporting System as a source of data on fatal shootings of civilians by law enforcement officers</article-title><source>Am J Public Health</source><year>2019</year><month>04</month><volume>109</volume><issue>4</issue><fpage>578</fpage><lpage>584</lpage><pub-id pub-id-type="doi">10.2105/AJPH.2018.304904</pub-id><pub-id pub-id-type="medline">30789773</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>National Violent Death Reporting System web coding manual, 6.0</article-title><source>Centers for Disease Control and
Prevention</source><year>2022</year><access-date>2025-05-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://stacks.cdc.gov/view/cdc/44789">https://stacks.cdc.gov/view/cdc/44789</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dang</surname><given-names>LN</given-names> </name><name name-style="western"><surname>Kahsay</surname><given-names>ET</given-names> </name><name name-style="western"><surname>James</surname><given-names>LN</given-names> </name><name name-style="western"><surname>Johns</surname><given-names>LJ</given-names> </name><name name-style="western"><surname>Rios</surname><given-names>IE</given-names> </name><name name-style="western"><surname>Mezuk</surname><given-names>B</given-names> </name></person-group><article-title>Research utility and limitations of textual data in the National Violent Death Reporting System: a scoping review and recommendations</article-title><source>Inj Epidemiol</source><year>2023</year><month>05</month><day>9</day><volume>10</volume><issue>1</issue><fpage>23</fpage><pub-id pub-id-type="doi">10.1186/s40621-023-00433-w</pub-id><pub-id pub-id-type="medline">37161610</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Workman</surname><given-names>TE</given-names> </name><name name-style="western"><surname>Goulet</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Brandt</surname><given-names>CA</given-names> </name><etal/></person-group><article-title>Identifying suicide documentation in clinical notes through zero-shot learning</article-title><source>Health Sci Rep</source><year>2023</year><month>09</month><volume>6</volume><issue>9</issue><fpage>e1526</fpage><pub-id 
pub-id-type="doi">10.1002/hsr2.1526</pub-id><pub-id pub-id-type="medline">37706016</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fernandes</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Dutta</surname><given-names>R</given-names> </name><name name-style="western"><surname>Velupillai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sanyal</surname><given-names>J</given-names> </name><name name-style="western"><surname>Stewart</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chandran</surname><given-names>D</given-names> </name></person-group><article-title>Identifying suicide ideation and suicidal attempts in a psychiatric clinical research database using natural language processing</article-title><source>Sci Rep</source><year>2018</year><month>05</month><day>9</day><volume>8</volume><issue>1</issue><fpage>7426</fpage><pub-id pub-id-type="doi">10.1038/s41598-018-25773-2</pub-id><pub-id pub-id-type="medline">29743531</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Obeid</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Dahne</surname><given-names>J</given-names> </name><name name-style="western"><surname>Christensen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Identifying and predicting intentional self-harm in electronic health record clinical notes: deep learning approach</article-title><source>JMIR Med Inform</source><year>2020</year><month>07</month><day>30</day><volume>8</volume><issue>7</issue><fpage>e17784</fpage><pub-id pub-id-type="doi">10.2196/17784</pub-id><pub-id pub-id-type="medline">32729840</pub-id></nlm-citation></ref><ref 
id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carson</surname><given-names>NJ</given-names> </name><name name-style="western"><surname>Mullin</surname><given-names>B</given-names> </name><name name-style="western"><surname>Sanchez</surname><given-names>MJ</given-names> </name><etal/></person-group><article-title>Identification of suicidal behavior among psychiatrically hospitalized adolescents using natural language processing and machine learning of electronic health records</article-title><source>PLoS One</source><year>2019</year><volume>14</volume><issue>2</issue><fpage>e0211116</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0211116</pub-id><pub-id pub-id-type="medline">30779800</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Leonard Westgate</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gui</surname><given-names>J</given-names> </name><name name-style="western"><surname>Watts</surname><given-names>BV</given-names> </name><name name-style="western"><surname>Shiner</surname><given-names>B</given-names> </name></person-group><article-title>Natural language processing of clinical mental health notes may add predictive value to existing suicide risk models</article-title><source>Psychol Med</source><year>2021</year><month>06</month><volume>51</volume><issue>8</issue><fpage>1382</fpage><lpage>1391</lpage><pub-id pub-id-type="doi">10.1017/S0033291720000173</pub-id><pub-id pub-id-type="medline">32063248</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bey</surname><given-names>R</given-names> 
</name><name name-style="western"><surname>Cohen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Trebossen</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Natural language processing of multi-hospital electronic health records for public health surveillance of suicidality</article-title><source>Npj Ment Health Res</source><year>2024</year><month>02</month><day>14</day><volume>3</volume><issue>1</issue><fpage>6</fpage><pub-id pub-id-type="doi">10.1038/s44184-023-00046-7</pub-id><pub-id pub-id-type="medline">38609541</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tabaie</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zeidan</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Evans</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>RN</given-names> </name><name name-style="western"><surname>Kamaleswaran</surname><given-names>R</given-names> </name></person-group><article-title>A novel technique to identify intimate partner violence in a hospital setting</article-title><source>West J Emerg Med</source><year>2022</year><month>09</month><day>12</day><volume>23</volume><issue>5</issue><fpage>781</fpage><lpage>788</lpage><pub-id pub-id-type="doi">10.5811/westjem.2022.7.56726</pub-id><pub-id pub-id-type="medline">36205673</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mason</surname><given-names>AJC</given-names> </name><name name-style="western"><surname>Bhavsar</surname><given-names>V</given-names> </name><name name-style="western"><surname>Botelle</surname><given-names>R</given-names> 
</name><etal/></person-group><article-title>Applying neural network algorithms to ascertain reported experiences of violence in routine mental healthcare records and distributions of reports by diagnosis</article-title><source>Front Psychiatry</source><year>2024</year><volume>15</volume><fpage>1181739</fpage><pub-id pub-id-type="doi">10.3389/fpsyt.2024.1181739</pub-id><pub-id pub-id-type="medline">39319350</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Botelle</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bhavsar</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kadra-Scalzo</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Can natural language processing models extract and classify instances of interpersonal violence in mental healthcare electronic records: an applied evaluative study</article-title><source>BMJ Open</source><year>2022</year><month>02</month><day>16</day><volume>12</volume><issue>2</issue><fpage>e052911</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2021-052911</pub-id><pub-id pub-id-type="medline">35172999</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Parker</surname><given-names>ST</given-names> </name></person-group><article-title>Estimating nonfatal gunshot injury locations with natural language processing and machine learning models</article-title><source>JAMA Netw Open</source><year>2020</year><month>10</month><day>1</day><volume>3</volume><issue>10</issue><fpage>e2020664</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.20664</pub-id><pub-id pub-id-type="medline">33052403</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Prater</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>EV</given-names> </name><name name-style="western"><surname>Mooney</surname><given-names>SJ</given-names> </name></person-group><article-title>Identifying rare circumstances preceding female firearm suicides: validating a large language model approach</article-title><source>JMIR Ment Health</source><year>2023</year><month>10</month><day>17</day><volume>10</volume><fpage>e49359</fpage><pub-id pub-id-type="doi">10.2196/49359</pub-id><pub-id pub-id-type="medline">37847549</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>An NLP approach to identify SDoH-related circumstance and suicide crisis from death investigation narratives</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>07</month><day>19</day><volume>30</volume><issue>8</issue><fpage>1408</fpage><lpage>1417</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad068</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shamsuddin</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Ralescu</surname><given-names>AL</given-names> 
</name></person-group><article-title>Classification with class imbalance problem: a review</article-title><source>Int J Adv Soft Comput Appl</source><year>2013</year><volume>29</volume></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Padurariu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Breaban</surname><given-names>ME</given-names> </name></person-group><article-title>Dealing with data imbalance in text classification</article-title><source>Procedia Comput Sci</source><year>2019</year><volume>159</volume><fpage>736</fpage><lpage>745</lpage><pub-id pub-id-type="doi">10.1016/j.procs.2019.09.229</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arseniev-Koehler</surname><given-names>A</given-names> </name><name name-style="western"><surname>Foster</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Mays</surname><given-names>VM</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Cochran</surname><given-names>SD</given-names> </name></person-group><article-title>Aggression, escalation, and other latent themes in legal intervention deaths of non-Hispanic Black and White men: results from the 2003&#x2012;2017 National Violent Death Reporting System</article-title><source>Am J Public Health</source><year>2021</year><month>07</month><volume>111</volume><issue>S2</issue><fpage>S107</fpage><lpage>S115</lpage><pub-id pub-id-type="doi">10.2105/AJPH.2021.306312</pub-id><pub-id pub-id-type="medline">33984244</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Subramanian</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rahimi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Baldwin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Cohn</surname><given-names>T</given-names> </name><name name-style="western"><surname>Frermann</surname><given-names>L</given-names> </name></person-group><article-title>Fairness-aware class imbalanced learning</article-title><source>arXiv</source><comment>Preprint posted online on Sep 21, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2109.10444</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shyalika</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wickramarachchi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sheth</surname><given-names>AP</given-names> </name></person-group><article-title>A comprehensive survey on rare event prediction</article-title><source>ACM Comput Surv</source><year>2025</year><month>03</month><day>31</day><volume>57</volume><issue>3</issue><fpage>1</fpage><lpage>39</lpage><pub-id pub-id-type="doi">10.1145/3699955</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhong</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xing</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Wang</surname><given-names>L</given-names> </name></person-group><article-title>A machine learning case study to predict rare clinical event of interest: imbalanced data, interpretability, and practical considerations</article-title><source>J Biopharm Stat</source><year>2024</year><month>06</month><day>11</day><volume>0</volume><fpage>1</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1080/10543406.2024.2364722</pub-id><pub-id pub-id-type="medline">38860696</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>N</given-names> </name></person-group><article-title>Large language models in health care: development, applications, and challenges</article-title><source>Health Care Sci</source><year>2023</year><month>08</month><volume>2</volume><issue>4</issue><fpage>255</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.1002/hcs2.61</pub-id><pub-id pub-id-type="medline">38939520</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abd-Alrazaq</surname><given-names>A</given-names> </name><name name-style="western"><surname>AlSaad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alhuwail</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Large language models in medical 
education: opportunities, challenges, and future directions</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>1</day><volume>9</volume><fpage>e48291</fpage><pub-id pub-id-type="doi">10.2196/48291</pub-id><pub-id pub-id-type="medline">37261894</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jahan</surname><given-names>I</given-names> </name><name name-style="western"><surname>Laskar</surname><given-names>MTR</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>JX</given-names> </name></person-group><article-title>A comprehensive evaluation of large language models on benchmark biomedical text processing tasks</article-title><source>Comput Biol Med</source><year>2024</year><month>03</month><volume>171</volume><fpage>108189</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108189</pub-id><pub-id pub-id-type="medline">38447502</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ge</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Al-Garadi</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Sarker</surname><given-names>A</given-names> </name></person-group><article-title>Comparison of pretraining models and strategies for health-related social media text classification</article-title><source>Healthcare (Basel)</source><year>2022</year><volume>10</volume><issue>8</issue><fpage>1478</fpage><pub-id 
pub-id-type="doi">10.3390/healthcare10081478</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Divita</surname><given-names>G</given-names> </name><name name-style="western"><surname>Workman</surname><given-names>TE</given-names> </name><name name-style="western"><surname>Redd</surname><given-names>D</given-names> </name><name name-style="western"><surname>Garvin</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Zeng-Treitler</surname><given-names>Q</given-names> </name></person-group><article-title>Clinical sublanguage trend and usage analysis from a large clinical corpus</article-title><conf-name>2020 IEEE International Conference on Big Data (Big Data)</conf-name><conf-date>Dec 10-13, 2020</conf-date><conf-loc>Atlanta, GA, USA</conf-loc><fpage>3837</fpage><lpage>3845</lpage><pub-id pub-id-type="doi">10.1109/BigData50022.2020.9378203</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Workman</surname><given-names>TE</given-names> </name><name name-style="western"><surname>Divita</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zeng-Treitler</surname><given-names>Q</given-names> </name></person-group><article-title>Discovering sublanguages in a large clinical corpus through unsupervised machine learning and information gain</article-title><conf-name>2019 IEEE International Conference on Big Data (Big Data)</conf-name><conf-date>Dec 9-12, 2019</conf-date><conf-loc>Los Angeles, CA, USA</conf-loc><fpage>4889</fpage><lpage>4898</lpage><pub-id pub-id-type="doi">10.1109/BigData47090.2019.9006492</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mezuk</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kalesnikava</surname><given-names>VA</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>C</given-names> </name></person-group><article-title>Not discussed: Inequalities in narrative text data for suicide deaths in the National Violent Death Reporting System</article-title><source>PLoS One</source><year>2021</year><volume>16</volume><issue>7</issue><fpage>e0254417</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0254417</pub-id><pub-id pub-id-type="medline">34270588</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arseniev-Koehler</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mays</surname><given-names>VM</given-names> </name><name name-style="western"><surname>Foster</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Cochran</surname><given-names>SD</given-names> </name></person-group><article-title>Gendered patterns in manifest and latent mental health indicators among suicide decedents: 2003-2020 National Violent Death Reporting System (NVDRS)</article-title><source>Am J Public Health</source><year>2024</year><month>03</month><volume>114</volume><issue>S3</issue><fpage>S268</fpage><lpage>S277</lpage><pub-id pub-id-type="doi">10.2105/AJPH.2023.307427</pub-id><pub-id pub-id-type="medline">37948056</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rahman</surname><given-names>N</given-names> </name><name name-style="western"><surname>Mozer</surname><given-names>R</given-names> </name><name name-style="western"><surname>McHugh</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Rockett</surname><given-names>IRH</given-names> </name><name name-style="western"><surname>Chow</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Vaughan</surname><given-names>G</given-names> </name></person-group><article-title>Using natural language processing to improve suicide classification requires consideration of race</article-title><source>Suicide Life Threat Behav</source><year>2022</year><month>08</month><volume>52</volume><issue>4</issue><fpage>782</fpage><lpage>791</lpage><pub-id pub-id-type="doi">10.1111/sltb.12862</pub-id><pub-id pub-id-type="medline">35384040</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chaumond</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name></person-group><article-title>DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter</article-title><source>arXiv</source><comment>Preprint posted online on Mar 1, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.01108</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional model performance metrics.</p><media xlink:href="ai_v4i1e68212_app1.docx" xlink:title="DOCX File, 25 
KB"/></supplementary-material></app-group></back></article>