<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e72256</article-id><article-id pub-id-type="doi">10.2196/72256</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Model&#x2013;Supported Identification of Intellectual Disabilities in Clinical Free-Text Summaries: Mixed Methods Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Edwards</surname><given-names>Aleksandra</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pardi&#x00F1;as</surname><given-names>Antonio F</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kirov</surname><given-names>George</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name 
name-style="western"><surname>Rees</surname><given-names>Elliott</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Camacho-Collados</surname><given-names>Jose</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>School of Computer Science and Informatics, Cardiff University</institution><addr-line>Cathays</addr-line><addr-line>Cardiff</addr-line><country>United Kingdom</country></aff><aff id="aff2"><institution>School of Medicine, Centre for Neuropsychiatric Genetics and Genomics, Division of Psychological Medicine and Clinical Neurosciences, Cardiff University</institution><addr-line>Cardiff</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Liu</surname><given-names>Hongfang</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Elbattah</surname><given-names>Mahmoud</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mohanadas</surname><given-names>Sadhasivam</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kishore Kancharla</surname><given-names>Venkateswara Siva</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Jiang</surname><given-names>Yiqun</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Aleksandra Edwards, PhD, School of Computer Science and Informatics, Cardiff University, Cathays, Cardiff, CF24 4AG, United Kingdom, 1 029 2087 4812; <email>edwardsai@cardiff.ac.uk</email></corresp><fn fn-type="equal" 
id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>18</day><month>9</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e72256</elocation-id><history><date date-type="received"><day>06</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>19</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>30</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; Aleksandra Ivaylova Edwards, Antonio F Pardi&#x00F1;as, George Kirov, Elliott Rees, Jose Camacho-Collados. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 18.9.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e72256"/><abstract><sec><title>Background</title><p>Free-text clinical data are unstructured and narrative in nature, providing a rich source of patient information, but extracting research-quality clinical phenotypes from these data remains a challenge. 
Manually reviewing and extracting clinical phenotypes from free-text patient notes is a time-consuming process and not suitable for large-scale datasets. On the other hand, automatically extracting clinical phenotypes can be challenging because medical researchers lack gold-standard annotated references and other purpose-built resources, including software. Recent large language models (LLMs) can understand natural language instructions, which help them adapt to different domains and tasks without the need for specific training data. This makes them suitable for clinical applications, though their use in this field is limited.</p></sec><sec><title>Objective</title><p>We aimed to develop an LLM pipeline based on the few-shot learning framework that could extract clinical information from free-text clinical summaries. We assessed the performance of this pipeline for classifying individuals with confirmed or suspected comorbid intellectual disability (ID) from clinical summaries of patients with severe mental illness and performed genetic validation of the results by testing whether individuals with LLM-defined ID carried more genetic variants known to confer risk of ID when compared with individuals without LLM-defined ID.</p></sec><sec sec-type="methods"><title>Methods</title><p>We developed novel approaches for performing classification, based on an intermediate information extraction (IE) step and human-in-the-loop techniques. We evaluated two models: Fine-Tuned Language Net Text-To-Text Transfer Transformer (Flan-T5) and Large Language Model Architecture (LLaMA). The dataset comprised 1144 free-text clinical summaries, of which 314 were manually annotated and used as a gold standard for evaluating automated methods. 
We also used published genetic data from 547 individuals to perform a genetic validation of the classification results; Firth&#x2019;s penalized logistic regression framework was used to test whether individuals with LLM-defined ID carry significantly more de novo variants in known developmental disorder risk genes than individuals without LLM-defined ID.</p></sec><sec sec-type="results"><title>Results</title><p>The results demonstrate that a 2-stage approach, combining IE with manual validation, can effectively identify individuals with suspected IDs from free-text patient records, requiring only a single training example per classification label. The best-performing method based on the Flan-T5 model and incorporating the IE step achieved an <italic>F</italic><sub>1</sub>-score of 0.867. Individuals classified as having ID by the best performing model were significantly enriched for de novo variants in known developmental disorder risk genes (odds ratio 29.1, 95% CI 7.36-107; <italic>P</italic>=2.1&#x00D7;10<sup>&#x2212;5</sup>).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs and in-context learning techniques combined with human-in-the-loop approaches can be highly beneficial for extraction and categorization of information from free-text clinical data. 
In this proof-of-concept study, we show that LLMs can be used to identify individuals with a severe mental illness who also have suspected ID, which is a biologically and clinically meaningful subgroup of patients.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>zero-shot learning</kwd><kwd>intellectual disabilities</kwd><kwd>clinical notes</kwd><kwd>text classification</kwd><kwd>information extraction</kwd><kwd>genetic analysis</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Text classification of clinical data is a challenging problem due to highly specialized terminology, diverse document structures, and the heavy reliance of most methods on annotated data [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Standard approaches to overcome these challenges involve the use of task-specific knowledge and rule-based methods [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>], which make them inapplicable to a wider range of tasks. This is because such methods rely heavily on handcrafted features and domain-specific rules, which do not generalize well beyond the narrow set of conditions they were designed for. Language models are neural network architectures trained to understand and generate human language by learning statistical patterns in large text corpora. Among the most influential are masked language models such as BERT (Bidirectional Encoder Representations From Transformers) and RoBERTa (Robustly Optimized BERT Approach) [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref12">12</xref>], which are pretrained on general-domain text and then fine-tuned&#x2014;that is, further trained on labeled data for a specific downstream task such as text classification. 
In the biomedical and clinical domains, specialized variants such as BioBERT (BERT for Biomedical Text Mining) [<xref ref-type="bibr" rid="ref13">13</xref>] and Clinical Bidirectional Encoder Representations From Transformers (ClinicalBERT) [<xref ref-type="bibr" rid="ref14">14</xref>] have been developed by continuing pretraining on domain-specific corpora. These models have demonstrated improved performance in tasks like clinical concept extraction and classification. However, despite their success, they still require large amounts of annotated training data to achieve strong performance [<xref ref-type="bibr" rid="ref15">15</xref>], which is often a limiting factor in clinical settings where labeled data are scarce [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. Therefore, methods should ideally work without training data in a zero- or few-shot framework. Recent advances in natural language processing (NLP) have introduced alternative methods using text generation or large language models (LLMs) like Large Language Model Architecture (LLaMA) [<xref ref-type="bibr" rid="ref19">19</xref>], which perform unseen tasks via in-context learning (prompting) [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. Prompting involves giving the model natural language instructions that describe the task [<xref ref-type="bibr" rid="ref24">24</xref>]. In few-shot prompting, these instructions are accompanied by a few training examples [<xref ref-type="bibr" rid="ref24">24</xref>]. Unlike fine-tuning, prompting does not modify model weights, making it less resource intensive. Research shows that prompting can match or exceed the performance of standard fine-tuning [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. 
Further gains in zero-shot settings have been achieved by fine-tuning models on task instructions, as in Fine-Tuned Language Net Text-To-Text Transfer Transformer (Flan-T5) [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. However, their application in the clinical domain remains limited.</p><p>In contrast to previous work, we explore a novel approach for performing classification for clinical data. We use an intermediate information extraction (IE) step and human-in-the-loop framework to maximize the performance of in-context learning techniques and LLMs for one-shot setting. As a real-world example, we applied this procedure to free-text clinical summaries in a cohort of patients with severe mental illness and classified individuals with suspected comorbid intellectual disability (ID). These free-text clinical summaries were previously created from discharge patient notes and clinical interviews for research purposes. This is a challenging task as these summaries have diverse structure and terminology and may contain information related to different types of disabilities that can be hard to distinguish without detailed reports of the person&#x2019;s cognitive functioning (eg, &#x201C;learning disability&#x201D; vs &#x201C;intellectual disability&#x201D;) [<xref ref-type="bibr" rid="ref28">28</xref>]. Further, the presence of ID can be described using diverse terminology and be implicitly referred to within the text by recording outcomes and scores from different types of clinical tests and assessments without explicitly mentioning any disability. 
To provide evidence that our LLM approach can identify a biologically meaningful subgroup of patients, we used genetic data that is available in a subset of patients with schizophrenia to test whether individuals classified by LLMs to have suspected comorbid ID carry significantly more rare variants in ID-associated genes than individuals without LLM-defined ID.</p></sec><sec id="s1-2"><title>Related Work</title><p>We discuss relevant work using LLMs for extracting and classifying information within clinical texts, as well as outline challenges and research gaps within the literature (see sections &#x201C;Text Classification for Clinical Text&#x201D; and &#x201C;Human-in-the-Loop&#x2013;Based Approaches&#x201D;). Finally, we present a use case from psychiatric genetics, a discipline with close relationships to the broader fields of clinical genetics and rare disease research (section &#x201C;Genetic Analysis&#x201D;).</p><sec id="s1-2-1"><title>Text Classification for Clinical Text</title><p>The generalization capabilities of text-generation models in few-shot settings make them suitable for the clinical domain, which often lacks annotated data and is also associated with limited access to datasets and language resources [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Thus, recent research in the clinical NLP field has focused on leveraging and evaluating the performance of such models for various tasks, including classification [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. 
Some papers focus on providing dataset resources to support easier evaluation and benchmarking of LLMs in zero- and few-shot settings [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], as well as instruction or domain-trained models based on LLaMA [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. These works show promising results for some downstream tasks.</p><p>A similar work to ours by Lu et al [<xref ref-type="bibr" rid="ref33">33</xref>] proposes a knowledge-enhanced prompt learning method for classification of diagnosis from clinical texts. The approach is based on extracting relevant knowledge to the given diagnosis from heterogeneous knowledge resources and integrating it into a prompt sequence along with the clinical note. The authors experiment with more traditional neural network approaches, as well as masked language models. In contrast to this work, we analyze recently developed LLMs such as Flan-T5 and LLaMA, and we focus on exploring in-context learning techniques. In similar research by Fabregat et al [<xref ref-type="bibr" rid="ref31">31</xref>], a Bidirectional Long Short-Term Memory (Bi-LSTM) architecture is used to extract the presence of named disabilities (including IDs) from clinical notes. In our paper, we extract intellectual disabilities, where for some patients these are not mentioned explicitly but instead referred to within the text by using different terminology as well as the results of relevant tests and assessments.</p><p>Another work introduces Generative Pre-Trained Transformer for Biomedical Text Generation and Mining (BioGPT) [<xref ref-type="bibr" rid="ref38">38</xref>], a language model pretrained on large-scale biomedical literature and evaluated on a range of tasks such as relation extraction, question answering (eg, PubMedQA), document classification, and text generation. 
While it outperforms BERT-based models on these benchmarks, it is built on the older GPT-2 architecture and was trained exclusively on biomedical research articles. This limits its applicability to clinical contexts, particularly for understanding the more informal language often found in patient note summaries. Moreover, BioGPT is relatively small, with only 1.5 billion parameters, making it less capable compared to more recent and larger models such as LLaMA [<xref ref-type="bibr" rid="ref19">19</xref>] and Flan-T5 [<xref ref-type="bibr" rid="ref26">26</xref>], which offer improved generalization, reasoning, and contextual understanding across diverse domains.</p><p>Despite these recent attempts in leveraging LLMs for the clinical domain, most of the work in NLP-related applications for the health care domain is still relying on the use of more data-consuming approaches for text classification and IE [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. In contrast to previous work, we explore different approaches for performing classification for patient notes using an intermediate IE step and human-in-the-loop approach to maximize the performance of in-context learning techniques and LLMs for one-shot setting.</p></sec><sec id="s1-2-2"><title>Human-in-the-Loop&#x2013;Based Approaches</title><p>Incorporating expert knowledge and human validation within automated approaches can have high benefits in the health care domain, given the undesirable consequences of misclassification inherent to some tasks and the subsequent need for highly accurate models [<xref ref-type="bibr" rid="ref39">39</xref>]. Despite this, research in developing such hybrid approaches is very limited, focusing mainly on incorporating domain knowledge within more traditional rule-based and dictionary-based approaches for IE [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. 
For instance, the authors of [<xref ref-type="bibr" rid="ref40">40</xref>] use a human-in-the-loop approach for constructing a lexicon for extracting medication names from clinical records. The authors of [<xref ref-type="bibr" rid="ref41">41</xref>] use a human-based verification step for building an ontology for structuring radiology reports. In contrast, our work is the first attempt at incorporating human knowledge with state-of-the-art NLP models to develop more accurate text classification approaches for scenarios with no training data available, except for a few examples.</p></sec><sec id="s1-2-3"><title>Genetic Validation of ID Identified by LLM</title><p>Schizophrenia is a severe and highly heritable psychiatric disorder [<xref ref-type="bibr" rid="ref42">42</xref>]. Impaired cognition is a core symptom of schizophrenia that strongly predicts worse functional outcomes [<xref ref-type="bibr" rid="ref43">43</xref>]. Studies have demonstrated that rare and common genetic variation contributes to variation in cognitive ability, or proxies of cognition such as educational attainment, in schizophrenia. For example, we recently showed that genes associated with early onset developmental disorders (including ID) are enriched for damaging rare variants in individuals with schizophrenia and suspected comorbid ID [<xref ref-type="bibr" rid="ref44">44</xref>]. In our previous work, we classified patients with schizophrenia with suspected comorbid ID by manually curating free-text clinical summaries. In this study, we analyzed the same exome-sequencing data and clinical summaries used in [<xref ref-type="bibr" rid="ref44">44</xref>] and tested whether individuals with schizophrenia and LLM-classified comorbid ID carry significantly more damaging rare variants in known ID-associated genes compared with individuals with schizophrenia who did not have LLM-classified ID. 
Demonstrating that individuals with LLM-classified ID are enriched for such mutations would provide evidence that our LLM approach can identify a biologically meaningful subgroup of patients with schizophrenia.</p></sec></sec><sec id="s1-3"><title>Objectives of This Study</title><p>Our objectives were as follows. First, we aimed to develop a novel 2-step approach that does not require training data for classifying individual patients with suspected ID within psychiatric clinical summaries. Toward this end, we explored approaches that use text generation models coupled with prompting techniques to perform IE for identifying ID-related information from the summaries, after which we performed classification on the extracted text. Second, we aimed to compare the performance of approaches that involve human-in-the-loop techniques and fully automated approaches. Third, we aimed to perform a multifaceted evaluation of model performance and analyze the effect of prompt information type (eg, task definitions vs examples provided as part of the prompt) on the performance for IE and text classification tasks. Lastly, we aimed to perform a genetic validation of the results via an experiment testing whether individuals with LLM-defined ID carry more genetic variants known to confer risk of ID when compared with individuals without LLM-defined ID.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data</title><sec id="s2-1-1"><title>Corpus of Clinical Free-Text Data</title><p>The corpus contained free-text clinical summaries of 1144 individuals with severe mental illness, including schizophrenia and bipolar disorder. The summaries for each person contained information related to symptoms, reactions to prescribed treatments and medications, as well as any other observations that can be clinically relevant, such as the comorbid presence of other illnesses and developmental conditions (including ID). 
The summaries were not written for the purpose of the current project but at the time of patient recruitment for general research purposes. The summaries contained information from discharge summaries and clinical interviews with psychiatrists. Some clinical summaries also included the Schedules for Clinical Assessment in Neuropsychiatry (SCAN) interview [<xref ref-type="bibr" rid="ref45">45</xref>], which is a semistructured clinical interview used to assess and diagnose psychiatric disorders. In addition, our dataset was a strong representation of real-world free-text clinical summaries, which often present challenges such as inconsistent structure, lack of labeled training data, and the use of diverse terminology where diseases and diagnoses may not be explicitly mentioned. Since the dataset was not specifically collected for this study, it offers potential for a wide range of research applications.</p></sec><sec id="s2-1-2"><title>Dataset Annotation</title><p>We annotated 314 patient notes by manually curating the free text and identifying evidence of ID (see <xref ref-type="table" rid="table1">Table 1</xref> for an overview of the annotation dataset). 
In addition to these 314 patient summaries, the best-performing automatic approaches were used to classify an additional 830 patients.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of the dataset.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Class</td><td align="left" valign="bottom">Number of texts</td><td align="left" valign="bottom">Average number of tokens</td></tr></thead><tbody><tr><td align="left" valign="top">ID<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">29</td><td align="left" valign="top">222</td></tr><tr><td align="left" valign="top">No ID<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">285</td><td align="left" valign="top">185</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">314</td><td align="left" valign="top">190</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ID: intellectual disability.</p></fn><fn id="table1fn2"><p><sup>b</sup>No ID: no evidence of intellectual disability present in the patient note (or lack of evidence).</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s2-2"><title>Methodology</title><sec id="s2-2-1"><title>Pipeline</title><p>We explored three different approaches for performing classification. First, we performed classification on the entire dataset to allow comparison with standard approaches. Second, we performed IE for identifying ID-related information from the summaries, and then we performed classification on the extracted text. 
Third, we proposed the use of a human-in-the-loop approach as an alternative to a fully automated approach where we use the IE step to extract only the relevant information to the task in a more concise format, which can support experts in performing more efficient and less error-prone annotation of documents.</p></sec><sec id="s2-2-2"><title>Prompting Techniques</title><p>We used three prompting techniques (see <xref ref-type="table" rid="table2">Table 2</xref>) for both IE and classification tasks: <italic>basic</italic>, <italic>definitions,</italic> and <italic>definitions+examples</italic>. This allowed us to identify what type of prompt information (ie, information provided as part of the instruction) is more beneficial for the model&#x2019;s performance.</p><list list-type="order"><list-item><p><italic>Basic prompt</italic>: In this prompt, we simply provided a question to the model without further information about the task.</p></list-item><list-item><p><italic>Definitions</italic>-enhanced prompt: In this prompt, we provided some descriptions about the tasks, that is, definition about intellectual disabilities, along with the question. We provided the same definition for both IE and classification models.</p></list-item><list-item><p><italic>Definitions+examples</italic> prompt: In this prompt, along with the definition, we provided one example per label&#x2014;for the IE task, we provided one example of an output that contains ID-related features and one that does not. The examples were selected randomly from the dataset. For generating data for the test sequences, we used a sampling method.</p></list-item></list><p>The definition used as part of the prompts for performing classification and IE is given in <xref ref-type="other" rid="box1">Textbox 1</xref>. 
The definition was taken from the <italic>International Statistical Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic> [<xref ref-type="bibr" rid="ref46">46</xref>]), which is a medical classification list by the World Health Organization.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Different prompt learning methods for clinical data.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Information extraction prompts</td><td align="left" valign="bottom">Classification prompts</td></tr></thead><tbody><tr><td align="left" valign="top">Basic prompt<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">What is the evidence of intellectual disability the patient displays from the given patient note?<break/>Patient Note: SZ, IQ of 65., paranoid elements...[Answer]</td><td align="left" valign="top">Does the patient display any evidence of intellectual disability from the given patient note?<break/>Patient Note: SZ, IQ of 65., paranoid elements...[Answer]</td></tr><tr><td align="left" valign="top">Definitions<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">[DEF]<break/>What is the evidence of intellectual disability the patient displays from the given patient note?<break/>Patient Note: SZ, IQ of 65., paranoid elements...[Answer]</td><td align="left" valign="top">[DEF]<break/>Does the patient display any evidence of intellectual disability from the given patient note?<break/>Patient Note: SZ, IQ of 65., paranoid elements...[Answer]</td></tr><tr><td align="left" valign="top">Definitions +examples<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">[DEF]<break/>What is the evidence of intellectual disability the patient displays from the given patient note?<break/>Patient Note: SZ, IQ of 65., paranoid 
elements...[IQ of 65.]<break/>Patient Note: Started hallucinating; Education: Lawyer...[No evidence.]<break/>Patient Note: SZ, premature birth, attended special school...[Answer.]</td><td align="left" valign="top">[DEF]<break/>Does the patient display any evidence of intellectual disability from the given patient note?<break/>Patient Note: SZ, IQ of 65., paranoid elements... [Yes]<break/>Patient Note: Started hallucinating; Education: Lawyer...[No]<break/>Patient Note: SZ, premature birth, attended special school...[Answer]</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>The &#x201C;Basic prompt&#x201D; method simply adds questions to the clinical summaries.</p></fn><fn id="table2fn2"><p><sup>b</sup>The &#x201C;Definitions&#x201D; prompt incorporates medical knowledge, ie, definition of intellectual disability, within the prompt.</p></fn><fn id="table2fn3"><p><sup>c</sup>The &#x201C;Definitions+examples&#x201D; method adds two annotated examples for both tasks, information extraction (IE) and classification. For example, in the IE task, we provide two patient summaries: one that contains an intellectual disability&#x2013;related feature (eg, &#x201C;IQ of 65&#x201D;) and one that does not. The first supports the presence of the feature and is labeled with the correct answer in square brackets (eg, [&#x201C;IQ of 65&#x201D;]). The second lacks such a feature and is labeled as &#x201C;no evidence.&#x201D; Additionally, new patient summaries marked with &#x201C;[Answer]&#x201D; are examples of where the model is expected to generate a prediction.</p></fn></table-wrap-foot></table-wrap><boxed-text id="box1"><title>Definition used in prompts.</title><p>You are a health care assistant, and you have been asked to identify if the given patient has intellectual disability (ID) for the given patient note. 
Please use the information about intellectual disabilities given below to extract the right information.</p><p>Intellectual disability (ID), previously known as mental retardation, is a term that is used when an individual has below-average intelligence or mental ability. Intellectual disability (ID) can be identified within the first two years of a child&#x2019;s life if he or she has more severe intellectual disabilities. However, mild intellectual disability may not be identifiable until the child reaches school-age, when challenges with academic learning become present. While it typically occurs during the developmental periods, it is also possible for intellectual disability to develop later as the result of illness or brain injury. Signs and symptoms of intellectual disabilities include: premature birth, delayed development, learning and developing more slowly than other children same age, difficulty communicating or socialising with others, lower than average scores on IQ tests, difficulties talking or talking late, having problems remembering things, inability to connect actions with consequences, difficulty with problem-solving or logical thinking, trouble learning in school, need to attend special school, inability to do everyday tasks like getting dressed or using the restroom without help.</p></boxed-text></sec></sec><sec id="s2-3"><title>Experimental Setup</title><p>In this section, we describe our experimental setting for the task of identifying patients with ID in free-text clinical data.</p><sec id="s2-3-1"><title>Comparison Models</title><p>We performed analysis with LLaMA 2 [<xref ref-type="bibr" rid="ref19">19</xref>] as a representative of a large autoregressive generation model with 70 billion parameters. As a representative of a smaller but instruction-tuned model, we used Flan-T5 [<xref ref-type="bibr" rid="ref26">26</xref>], in particular its XXL version with 11 billion parameters. 
The model was fine-tuned using the Flan instruction tuning tasks collection [<xref ref-type="bibr" rid="ref26">26</xref>]. The collection also included datasets related to the medical domain and classification tasks. We used the XXL version with 11 billion parameters. We downloaded the models from Hugging Face [<xref ref-type="bibr" rid="ref47">47</xref>]. Due to the sensitivity of the patient notes, we decided against using OpenAI models or other external application programming interfaces requiring data upload for performing analysis. We chose LLaMA and Flan-T5 for our experiments because they are among the most recent, largest, and most versatile language models available, demonstrating strong performance across a wide range of tasks. Notably, Flan-T5&#x2019;s training data includes medical content, which enhances its ability to understand and generate clinically relevant text, which makes it especially suitable for our use case. The model parameters we used for summarization and text classification are as follows: for Flan-T5, we used a temperature of 0.7 and a maximum of 10 and 30 generated tokens for classification and summarization, respectively. These are the default values recommended for these models. We used approximately 24 hours of GPU budget and the Nvidia RTX 4090 GPU (Nvidia Corporation). The implementation is available at GitHub [<xref ref-type="bibr" rid="ref48">48</xref>].</p></sec><sec id="s2-3-2"><title>Evaluation</title><p>We report classification results based on precision, recall, and standard micro- and macroaveraged <italic>F</italic><sub>1</sub> [<xref ref-type="bibr" rid="ref49">49</xref>]. 
We judged the quality of the data generated during the IE stage based on the performance of the classification model applied to the IE output.</p></sec><sec id="s2-3-3"><title>Genetic Analysis</title><p>We analyzed published genetic data that we previously generated from 547 individuals with schizophrenia who also have free-text clinical data (the paper by Rammos et al [<xref ref-type="bibr" rid="ref44">44</xref>] provides a full description of this sample and genetic dataset). In this genetic analysis, we compared the rate of de novo variants (ie, newly arising mutations that were not inherited from either parent) in known developmental disorder risk genes between patients with suspected comorbid ID and patients without ID using Firth&#x2019;s penalized likelihood logistic regression test, covarying for 10 principal components that were derived from the genetic data to control for genetic ancestry and sex. In the genetic analysis, we examined three classifications of ID: (1) ID defined through manual curation of the clinical summaries; (2) ID defined by the best performing fully automated NLP classification model; and (3) ID by the best performing human-in-the-loop classification model. 
Based on our previous study, we expected patients with suspected comorbid ID to have significantly more de novo variants in developmental disorder risk genes than patients without comorbid ID [<xref ref-type="bibr" rid="ref44">44</xref>].</p></sec></sec><sec id="s2-4"><title>Ethical Considerations</title><p>The following committees provided ethical approval for this study: Ethics Commission, Higher Medical University, Plovdiv, 4002 V Aprilov Blvd 15a; Protocol Ethics Committee to the Alexander University Hospital, Sofia 1431, 1 St G Sofiisk St, Local Ethics Committee, District Dispensary for psychiatric disorders, Russe, bul; Ethics Committee at the State Psychiatric Hospital &#x201C;Dr Georgi Kisiov,&#x201D; Radnevo, 6269, Magda Petkanova St 1, Radnevo (protocol of 2.10.2000); and Ethics Committee at the District Dispensary for psychiatric disorders, Blagoevgrad (protocol N2/2000). In the United Kingdom, the project was approved by the Bro Taf Local Research Ethics Committee, Churchill House, 17 Churchill Way, Cardiff CF10 2TW, protocol 02/4523. All study participants provided written informed consent with the ability to opt out at any time. Data were deidentified prior to analysis, and no identifying participant information is presented in this study. Participants were not compensated for their inclusion in the study.</p><p>The use of LLMs in processing clinical notes raises significant ethical challenges that must be carefully addressed. Preserving patient privacy is an essential priority when it comes to analyzing clinical notes. Even when deidentified, the risk of reidentification remains, especially with powerful models capable of memorizing or inferring sensitive data. Model transparency is another key concern, making it difficult to interpret LLM decision-making processes and validate their outputs in clinical settings where accountability is crucial. 
Further, bias in training data can propagate or even amplify existing disparities in health care, potentially leading to skewed predictions [<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>].</p><p>While a comprehensive treatment of these issues lies outside the scope of our current work, we actively considered these ethical concerns throughout our research. To mitigate risks, we have not released the dataset publicly, and we limited our experiments to open-source models. Moreover, our proposed integration of human-in-the-loop methods provides an additional layer of oversight, helping to ensure more responsible and secure use of LLMs in sensitive clinical applications.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>The Role of IE</title><p>Results in <xref ref-type="table" rid="table3">Table 3</xref> show that Flan-T5 consistently outperforms LLaMA 2 regardless of classification approach or prompt use. This suggests that a smaller but instruction-tuned model, pretrained using datasets relevant to the clinical domain, is more suitable for classification in low-resource settings when compared with a bigger model. Further, results show that the performance of IE approaches is highly dependent on the prompt used, where the difference in <italic>F</italic><sub>1</sub>-score for the positive class between the best and worst performing IE approach is around 0.5 (see <xref ref-type="fig" rid="figure1">Figure 1</xref>). A trend in the performance of both models (see <xref ref-type="table" rid="table3">Table 3</xref>) shows that a prompt combining a description of the task and examples leads to the best classification results versus a basic prompt or a prompt based only on definitions. 
Further, the best results were achieved with the Flan-T5 model using the IE intermediate step with a &#x201C;definitions+examples&#x201D; prompt informing both classification and IE (precision=1.00; recall=0.758) versus performing classification on the entire notes (<italic>P</italic>=.87; <italic>r</italic>=0.77).</p><p>These results show the potential of LLMs and in-context learning techniques to support classification tasks in the clinical domain. However, the performance of Flan-T5 varies with different prompts, whereas the LLaMA model achieves consistent improvements in classification regardless of the prompt. These findings highlight the prompt sensitivity issue in language models, particularly in smaller models like Flan-T5.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Classification results.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Input and classification<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="5">Flan-T5<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom" colspan="5">LLaMA<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="bottom" colspan="2"/><td align="left" valign="bottom">Prec<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="bottom">Rec<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="bottom">F1(pos)<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="bottom">Macro<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td><td align="left" valign="bottom">Acc<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup></td><td align="left" valign="bottom">Prec</td><td align="left" valign="bottom">Rec</td><td align="left" valign="bottom">F1(pos)</td><td align="left" 
valign="bottom">Macro</td><td align="left" valign="bottom">Acc</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="12">Full notes</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">basic</td><td align="left" valign="top">0.741</td><td align="left" valign="top">0.793</td><td align="left" valign="top">0.767</td><td align="left" valign="top">0.871</td><td align="left" valign="top">0.955</td><td align="left" valign="top">0.083</td><td align="left" valign="top">0.279</td><td align="left" valign="top">0.127</td><td align="left" valign="top">0.403</td><td align="left" valign="top">0.467</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">examples</td><td align="left" valign="top">0.676</td><td align="left" valign="top">0.766</td><td align="left" valign="top">0.718</td><td align="left" valign="top">0.843</td><td align="left" valign="top">0.942</td><td align="left" valign="top">0.122</td><td align="left" valign="top">0.758</td><td align="left" valign="top">0.210</td><td align="left" valign="top">0.406</td><td align="left" valign="top">0.471</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def</td><td align="left" valign="top">0.741</td><td align="left" valign="top">0.689</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.843</td><td align="left" valign="top">0.948</td><td align="left" valign="top">0.113</td><td align="left" valign="top">0.379</td><td align="left" valign="top">0.174</td><td align="left" valign="top">0.482</td><td align="left" valign="top">0.665</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def+ex</td><td align="left" valign="top">0.869</td><td align="left" valign="top">0.689</td><td align="left" valign="top">0.769</td><td align="left" valign="top">0.874</td><td align="left" valign="top">0.961</td><td align="left" valign="top">0.125</td><td align="left" valign="top">0.689</td><td align="left" valign="top">0.212</td><td 
align="left" valign="top">0.435</td><td align="left" valign="top">0.522</td></tr><tr><td align="left" valign="top" colspan="12">IE (basic)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">basic</td><td align="left" valign="top">0.205</td><td align="left" valign="top">0.827</td><td align="left" valign="top">0.328</td><td align="left" valign="top">0.562</td><td align="left" valign="top">0.686</td><td align="left" valign="top">0.103</td><td align="left" valign="top">0.295</td><td align="left" valign="top">0.152</td><td align="left" valign="top">0.423</td><td align="left" valign="top">0.505</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">examples</td><td align="left" valign="top">0.217</td><td align="left" valign="top">0.896</td><td align="left" valign="top">0.348</td><td align="left" valign="top">0.572</td><td align="left" valign="top">0.690</td><td align="left" valign="top">0.153</td><td align="left" valign="top">0.750</td><td align="left" valign="top">0.254</td><td align="left" valign="top">0.447</td><td align="left" valign="top">0.515</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def</td><td align="left" valign="top">0.180</td><td align="left" valign="top">0.827</td><td align="left" valign="top">0.296</td><td align="left" valign="top">0.525</td><td align="left" valign="top">0.635</td><td align="left" valign="top">0.164</td><td align="left" valign="top">0.464</td><td align="left" valign="top">0.242</td><td align="left" valign="top">0.520</td><td align="left" valign="top">0.681</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def+ex</td><td align="left" valign="top">0.220</td><td align="left" valign="top">0.827</td><td align="left" valign="top">0.348</td><td align="left" valign="top">0.582</td><td align="left" valign="top">0.712</td><td align="left" valign="top">0.174</td><td align="left" valign="top">0.642</td><td align="left" valign="top">0.274</td><td 
align="left" valign="top">0.511</td><td align="left" valign="top">0.625</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">human</td><td align="left" valign="top">0.294</td><td align="left" valign="top">0.862</td><td align="left" valign="top">0.439</td><td align="left" valign="top">0.657</td><td align="left" valign="top">0.796</td><td align="left" valign="top">0.277</td><td align="left" valign="top">0.517</td><td align="left" valign="top">0.361</td><td align="left" valign="top">0.618</td><td align="left" valign="top">0.792</td></tr><tr><td align="left" valign="top" colspan="12">IE (def)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">basic</td><td align="left" valign="top">0.135</td><td align="left" valign="top">0.827</td><td align="left" valign="top">0.233</td><td align="left" valign="top">0.426</td><td align="left" valign="top">0.491</td><td align="left" valign="top">0.121</td><td align="left" valign="top">0.285</td><td align="left" valign="top">0.170</td><td align="left" valign="top">0.511</td><td align="left" valign="top">0.749</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">examples</td><td align="left" valign="top">0.141</td><td align="left" valign="top">0.896</td><td align="left" valign="top">0.243</td><td align="left" valign="top">0.423</td><td align="left" valign="top">0.479</td><td align="left" valign="top">0.127</td><td align="left" valign="top">0.517</td><td align="left" valign="top">0.204</td><td align="left" valign="top">0.479</td><td align="left" valign="top">0.625</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def</td><td align="left" valign="top">0.126</td><td align="left" valign="top">0.862</td><td align="left" valign="top">0.221</td><td align="left" valign="top">0.388</td><td align="left" valign="top">0.434</td><td align="left" valign="top">0.046</td><td align="left" valign="top">0.178</td><td align="left" valign="top">0.074</td><td align="left" 
valign="top">0.408</td><td align="left" valign="top">0.598</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def+ex</td><td align="left" valign="top">0.146</td><td align="left" valign="top">0.896</td><td align="left" valign="top">0.251</td><td align="left" valign="top">0.438</td><td align="left" valign="top">0.501</td><td align="left" valign="top">0.184</td><td align="left" valign="top">0.586</td><td align="left" valign="top">0.280</td><td align="left" valign="top">0.554</td><td align="left" valign="top">0.721</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">human</td><td align="left" valign="top">0.896</td><td align="left" valign="top">0.193</td><td align="left" valign="top">0.317</td><td align="left" valign="top">0.536</td><td align="left" valign="top">0.639</td><td align="left" valign="top">0.583</td><td align="left" valign="top">0.233</td><td align="left" valign="top">0.333</td><td align="left" valign="top">0.642</td><td align="left" valign="top">0.910</td></tr><tr><td align="left" valign="top" colspan="12">IE (ex)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">basic</td><td align="left" valign="top">0.487</td><td align="left" valign="top">0.759</td><td align="left" valign="top">0.595</td><td align="left" valign="top">0.770</td><td align="left" valign="top">0.903</td><td align="left" valign="top">0.163</td><td align="left" valign="top">0.571</td><td align="left" valign="top">0.253</td><td align="left" valign="top">0.531</td><td align="left" valign="top">0.696</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">examples</td><td align="left" valign="top">0.489</td><td align="left" valign="top">0.828</td><td align="left" valign="top">0.615</td><td align="left" valign="top">0.780</td><td align="left" valign="top">0.904</td><td align="left" valign="top">0.256</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.377</td><td align="left" 
valign="top">0.624</td><td align="left" valign="top">0.787</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def</td><td align="left" valign="top">0.500</td><td align="left" valign="top">0.828</td><td align="left" valign="top">0.623</td><td align="left" valign="top">0.785</td><td align="left" valign="top">0.907</td><td align="left" valign="top">0.093</td><td align="left" valign="top">0.428</td><td align="left" valign="top">0.153</td><td align="left" valign="top">0.434</td><td align="left" valign="top">0.574</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def+ex</td><td align="left" valign="top">0.589</td><td align="left" valign="top">0.793</td><td align="left" valign="top">0.676</td><td align="left" valign="top">0.818</td><td align="left" valign="top">0.929</td><td align="left" valign="top">0.327</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.449</td><td align="left" valign="top">0.678</td><td align="left" valign="top">0.841</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">human</td><td align="left" valign="top">0.815</td><td align="left" valign="top">0.759</td><td align="left" valign="top">0.786</td><td align="left" valign="top">0.882</td><td align="left" valign="top">0.962</td><td align="left" valign="top">0.666</td><td align="left" valign="top">0.689</td><td align="left" valign="top">0.677</td><td align="left" valign="top">0.822</td><td align="left" valign="top">0.938</td></tr><tr><td align="left" valign="top" colspan="12">IE (def+ex)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">basic</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.621</td><td align="left" valign="top">0.765</td><td align="left" valign="top">0.873</td><td align="left" valign="top">0.964</td><td align="left" valign="top">0.160</td><td align="left" valign="top">0.592</td><td align="left" valign="top">0.252</td><td align="left" 
valign="top">0.528</td><td align="left" valign="top">0.691</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">examples</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.655</td><td align="left" valign="top">0.792</td><td align="left" valign="top">0.887</td><td align="left" valign="top">0.967</td><td align="left" valign="top">0.169</td><td align="left" valign="top">0.857</td><td align="left" valign="top">0.282</td><td align="left" valign="top">0.505</td><td align="left" valign="top">0.605</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.655</td><td align="left" valign="top">0.792</td><td align="left" valign="top">0.887</td><td align="left" valign="top">0.967</td><td align="left" valign="top">0.117</td><td align="left" valign="top">0.571</td><td align="left" valign="top">0.194</td><td align="left" valign="top">0.450</td><td align="left" valign="top">0.569</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">def+ex</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.758</td><td align="left" valign="top">0.863</td><td align="left" valign="top">0.887</td><td align="left" valign="top">0.968</td><td align="left" valign="top">0.205</td><td align="left" valign="top">0.857</td><td align="left" valign="top">0.331</td><td align="left" valign="top">0.562</td><td align="left" valign="top">0.686</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">human</td><td align="left" valign="top">1.000</td><td align="left" valign="top">0.767</td><td align="left" valign="top">0.867</td><td align="left" valign="top">0.928</td><td align="left" valign="top">0.978</td><td align="left" valign="top">0.793</td><td align="left" valign="top">0.639</td><td align="left" valign="top">0.707</td><td align="left" valign="top">0.836</td><td align="left" 
valign="top">0.939</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>The &#x201C;input&#x201D; refers to the type of input passed to the classifier, which is either an entire note or the output from the information extraction (IE) step where &#x201C;IE (basic),&#x201D; &#x201C;IE (def),&#x201D; &#x201C;IE (ex),&#x201D; and &#x201C;IE (def+ex)&#x201D; show prompt types used for the IE step. The &#x201C;prompt (class)&#x201D; refers to the prompt types we used for classification, that is, &#x201C;basic,&#x201D; &#x201C;examples,&#x201D; &#x201C;def,&#x201D; and &#x201C;def+ex.&#x201D; The &#x201C;human&#x201D; classification refers to the human-in-the-loop classification approach.</p></fn><fn id="table3fn2"><p><sup>b</sup>Flan-T5: Fine-Tuned Language Net Text-To-Text Transfer Transformer.</p></fn><fn id="table3fn3"><p><sup>c</sup>LLaMA: Large Language Model Architecture.</p></fn><fn id="table3fn4"><p><sup>d</sup>Prec: precision.</p></fn><fn id="table3fn5"><p><sup>e</sup>Rec: recall.</p></fn><fn id="table3fn6"><p><sup>f</sup>F1(pos): <italic>F</italic><sub>1</sub> for the positive class.</p></fn><fn id="table3fn7"><p><sup>g</sup>macro: <italic>F</italic><sub>1</sub>-macro.</p></fn><fn id="table3fn8"><p><sup>h</sup>Acc: accuracy score.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Comparison between classification approaches, that is, using entire notes versus using IE where &#x201C;F1(pos)&#x201D; refers to the <italic>F</italic><sub>1</sub>-score for the positive class. &#x201C;IE (avg),&#x201D; &#x201C;IE (best),&#x201D; and &#x201C;IE (worst)&#x201D; refer to the average, best, and worst results, respectively, for the IE approach per classification prompt. 
Classif: classification; Flan-T5: Fine-Tuned Language Net Text-To-Text Transfer Transformer; IE: information extraction; LLaMA: Large Language Model Architecture.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e72256_fig01.png"/></fig></sec><sec id="s3-2"><title>Human-in-the-Loop Approach</title><p>Results in <xref ref-type="fig" rid="figure2">Figure 2</xref> and <xref ref-type="table" rid="table3">Table 3</xref> show that combining IE for extracting task-relevant information and manual classification can support more accurate and less time-consuming classification versus using fully manual or fully automated methods. The human-in-the-loop approach led to better classification performance, especially for the LLaMA model where the difference in F1(pos) between the best performing IE approach and human-based method is 0.436. For the Flan-T5 model, the improvement in F1(pos) is 0.003 (F1(pos)=0.867 vs F1(pos)=0.863). Further, the average length of extracted passages using the FLan-T5 model is 3 tokens, whereas the average length for the entire note is 190 tokens. This shows that the IE step combined with the human-in-the-loop approach can be beneficial for supporting verification or the conduct of more efficient expert annotations. This could be a good alternative to a fully automated classification, especially in the health care domain where accuracy of models and reliability of results are of high importance.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Comparison between fully automated IE-based classification and human-in-the-loop approach where &#x201C;F1(pos)&#x201D; refers to the <italic>F</italic><sub>1</sub>-score for the positive class, &#x201C;classif (best)&#x201D; and &#x201C;classif (worst)&#x201D; refer to the best and worst classification results, respectively, per given IE prompt. 
Flan-T5: Fine-Tuned Language Net Text-To-Text Transfer Transformer; IE: information extraction; LLaMA: Large Language Model Architecture.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e72256_fig02.png"/></fig></sec><sec id="s3-3"><title>Genetic Validation of Different Classification Approaches</title><p>In all three classifications of ID (manual curation of clinical summaries, best performing fully automated NLP model, and performing human-in-the-loop model; see <xref ref-type="table" rid="table3">Table 3</xref>), damaging de novo variants were significantly enriched in patients with schizophrenia with suspected comorbid ID compared with patients with schizophrenia without ID (<xref ref-type="table" rid="table4">Table 4</xref>). De novo variants were most strongly enriched in the schizophrenia ID group defined by the human-in-the-loop classification (odds ratio 29.1, 95% CI 7.36-107), with the weakest enrichment observed in the fully automated classification (odds ratio 15.7, 95% CI 3.58-57.5). The same set of de novo variants was observed in the ID and non-ID patient groups in the human-in-the-loop and manual curation classifications, but a greater enrichment was observed in the human-in-the-loop classification test as fewer individuals were classified to have ID (14 in the human-in-the-loop classification vs 18 in the manual curation classification; <xref ref-type="table" rid="table4">Table 4</xref>).</p><p>To investigate why fewer people were found to have ID in the human-in-the-loop classification dataset, we examined the overlap of individuals with ID in this dataset and the manually curated classification dataset. We also reexamined the clinical information for individuals found to have ID in only one dataset. Eleven individuals with schizophrenia were recorded as having ID in both the manually curated and human-in-the-loop classification datasets. 
Among the 7 individuals who were found to have ID only in the manually curated classification dataset, 2 had clear evidence of having ID, 2 had ambiguous evidence of having ID, and 3 had no evidence of having ID. Among the 3 individuals found to have ID only in the human-in-the-loop classification dataset, 1 had clear evidence of having ID and 2 had ambiguous evidence of having ID. These results provide suggestive evidence that the human-in-the-loop approach produces fewer false positive ID classifications when compared with the manually curated ID set.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Enrichment of de novo variants in individuals with SZ<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> and comorbid ID<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Classifier<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="bottom">Patients with SZ and ID, n</td><td align="left" valign="bottom">Patients with SZ but without ID, n</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">Odds ratio (95% CI)</td><td align="left" valign="bottom">Variants in ID group, n (rate)</td><td align="left" valign="bottom">Variants in no-ID group, n (rate)</td></tr></thead><tbody><tr><td align="left" valign="top">Manual curation</td><td align="left" valign="top">18</td><td align="left" valign="top">529</td><td align="left" valign="top">7.1&#x00D7;10<sup>&#x2013;5</sup></td><td align="left" valign="top">21.1 (5.48-74.0)</td><td align="left" valign="top">4 (0.22)</td><td align="left" valign="top">7 (0.013)</td></tr><tr><td align="left" valign="top">Automatic model</td><td align="left" valign="top">16</td><td align="left" valign="top">531</td><td align="left" valign="top">9.3&#x00D7;10<sup>&#x2013;5</sup></td><td align="left" valign="top">15.7 
(3.58-57.5)</td><td align="left" valign="top">3 (0.19)</td><td align="left" valign="top">8 (0.015)</td></tr><tr><td align="left" valign="top">Human-in-the-loop</td><td align="left" valign="top">14</td><td align="left" valign="top">533</td><td align="left" valign="top">2.1&#x00D7;10<sup>&#x2013;5</sup></td><td align="left" valign="top">29.1 (7.36-107)</td><td align="left" valign="top">4 (0.29)</td><td align="left" valign="top">7 (0.013)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SZ: schizophrenia.</p></fn><fn id="table4fn2"><p><sup>b</sup>ID: intellectual disability.</p></fn><fn id="table4fn3"><p><sup>c</sup>The numbers of patients with ID and without ID are presented across three ID classifications where &#x201C;manual curation&#x201D; refers to manual annotations performed by a domain expert, &#x201C;automatic model&#x201D; refers to the best performing fully automatic model in <xref ref-type="table" rid="table3">Table 3</xref>, and &#x201C;human-in-the-loop&#x201D; refers to the best performing human-in-the-loop approach in <xref ref-type="table" rid="table3">Table 3</xref>.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our findings show the potential of LLMs to facilitate different tasks in the clinical domain, such as classification and IE when only a few training examples are available. We compared three approaches for classifying ID from the clinical summaries of individuals with severe mental illness and found that using an IE step as part of a classification pipeline based on the Flan-T5 model and informed by a prompt combining definitions and examples achieved a precision of 1.00 and <italic>F</italic><sub>1</sub> of 0.867 for the positive class. 
Further improvements to classification were found when using the human-in-the-loop approach, a process where a human must review short NLP-derived summaries instead of the full clinical dataset. These findings open interesting research avenues in building hybrid approaches, which combine the benefits LLMs offer for extracting relevant information in a fast and efficient manner with the knowledge of experts. These kinds of methods can be suitable for classification tasks which are considered challenging even for domain experts, as well as for sensitive tasks that require high accuracy, which is typically hard to achieve when labeled training data is scarce. Moreover, the ability of our methods to accurately classify ID was supported by our genetic analysis, which found the NLP-defined subgroup of people with schizophrenia and ID to be enriched for genetic variants in genes known to be associated with ID. From conversations with geneticists in the area, we found that manual annotation of comorbidities is often based on keyword search, since reading of thousands of clinical notes is typically unfeasible. Our findings therefore suggest that NLP-based approaches can be used to validate or improve classification annotations derived from the manual curation of free-text clinical summaries, which is prone to human error. 
Further work is required to test these findings in routinely collected health care data, such as that captured in electronic health records, and to determine how best to integrate these methods into genetic analysis research and beyond the detection of IDs, which was used as a first approximation to the more general problem of extracting information from free-text clinical summaries.</p></sec><sec id="s4-2"><title>Importance of Both Automation and Human Intervention in Health Care Applications</title><p>We have performed further analysis comparing the execution time efficiency of the best performing automatic and human-in-the-loop approaches to the manual methods used by experts for annotating the data (see <xref ref-type="table" rid="table5">Table 5</xref>). Specifically, the manual curation involved careful reading through the entire dataset to classify each summary. In the keyword search, experts perform simple searches using the operating system's search functionality to find potential class candidates. For these experiments, they used the keywords &#x201C;IQ&#x201D; and &#x201C;mental handicap&#x201D;. The results in <xref ref-type="table" rid="table5">Table 5</xref> show the benefits of the automatic and human-in-the-loop approach versus the human-based annotation where the fully automated method is more than 20 times faster than manual curation and keyword-based search. In addition, keyword search has a slightly lower <italic>F</italic><sub>1</sub>-score than automatic approaches. The reason for this is the limited number of keywords used in experiments. However, these results still highlight potential problems with such an approach where a careful selection of keywords is needed as well as a good knowledge of the corpus. Further, the human-in-the-loop approach performs at a very similar execution time to the fully automated approach but produces a higher <italic>F</italic><sub>1</sub>-score. 
Perhaps even more importantly, the fact that an expert can be involved in the process provides increased reassurance compared to a fully automatic process. This shows that semiautomatic approaches can be an efficient and more reliable option versus full automation for the health care domain where high accuracy of models is required. However, research in this field is still very limited. Our work is one of the first attempts to incorporate manual verification and LLMs to create more reliable and less data-consuming approaches for classification of clinical free-text data.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Comparison of the performance of classification approaches in terms of time taken for annotating 314 clinical summaries.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Approach</td><td align="left" valign="bottom">Time</td><td align="left" valign="bottom">F1(pos)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Manual curation<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">&#x223C;10 h</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top">Keyword search<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">&#x223C;2 h</td><td align="left" valign="top">0.845</td></tr><tr><td align="left" valign="top">Automatic model<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">&#x223C;5 min</td><td align="left" valign="top">0.863</td></tr><tr><td align="left" valign="top">Human-in-the-loop<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">&#x223C;20 min</td><td align="left" valign="top">0.863</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>F1(pos): <italic>F</italic><sub>1</sub> 
for the positive class.</p></fn><fn id="table5fn2"><p><sup>b</sup>&#x201C;Manual curation&#x201D; refers to manual annotations performed by a domain expert. </p></fn><fn id="table5fn3"><p><sup>c</sup>&#x201C;Keyword search&#x201D; refers to CTRL-F&#x2013;based search using the keywords &#x201C;IQ&#x201D; and &#x201C;mental handicap.&#x201D;</p></fn><fn id="table5fn4"><p><sup>d</sup>&#x201C;Automatic model&#x201D; refers to the best performing fully automatic model in <xref ref-type="table" rid="table3">Table 3</xref>.</p></fn><fn id="table5fn5"><p><sup>e</sup>&#x201C;Human-in-the-loop&#x201D; refers to the best performing human-in-the-loop approach in <xref ref-type="table" rid="table3">Table 3</xref>.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4-3"><title>Limitations</title><p>First, this study is based on a single free-text clinical dataset containing a summary of real-world clinical data, which was developed for research purposes. This dataset is not a direct copy of routinely collected clinical data, which may limit the generalization of the results. To mitigate this issue, we used unsupervised approaches that are aimed at modeling the problem at hand, without relying on training data that may easily be overfit to our own data. Nonetheless, these experiments would ideally need to be replicated on similar cohorts to better understand the strengths and limitations. Second, and related to the first point, the experiments were performed in English only, which is the language of the corpus of clinical summaries. Third, for this analysis, we simplified the task into a binary problem, in which patients have suspected comorbid ID or no evidence of ID. However, the clinical course of many medical conditions is complex, with high heterogeneity in the type and severity of symptoms both across individuals and within individuals over time. 
This is particularly true for mental health conditions, where individuals often receive different diagnoses during their life. It is therefore important for future studies to consider how models can adapt and capture these complex clinical phenotypes from longitudinal health care data. Although outside the scope of this paper, we note that our approach offers a promising path for this task, since it leverages in-context learning and integrates both labeled data and regularly updated, domain-specific external resources.</p><p>Another limitation of our study was that all manual annotations were performed by a single expert. This was due to the complexity and domain-specific expertise required for the task, making it unfeasible to involve multiple annotators. To ensure the reliability of the automated methods, we validated them not only against human annotations but also by using genetic data to confirm that individuals with NLP-defined ID are enriched for genetic variants known to be associated with ID.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In this study, we analyze how language models such as Flan-T5 and LLaMA 2 combined with in-context learning can be utilized for classifying individuals with severe mental illness and suspected ID from free-text clinical summaries. We propose the use of an intermediate IE step for extracting relevant parts of the notes before classification. Our results show that such techniques can help improve the performance of LLMs in one-shot settings when combined with a prompt that provides both information about the task and relevant examples. In addition, we propose a human-in-the-loop approach as an alternative to fully automated classification, where the IE step is used to extract succinct parts of the notes related to the task, which can be used to support faster and less error-prone manual classification. 
Approaches based on this pipeline and the Flan-T5 model showed promising results and were validated in a proof-of-concept genetic analysis, which found that individuals classified by NLP to have ID were enriched for genetic variants known to contribute to developmental disorders.</p></sec></sec></body><back><ack><p>AFP was supported by grants from UK Research and Innovation (UKRI)&#x2019;s Medical Research Council (MR/Y004094/1 and MR/Z503745/1) and the European Commission Horizon Programmes 2020 (&#x201C;REALMENT&#x201D;; 964874) and 2021 (&#x201C;PsychSTRATA&#x201D;; 101057454). ER was supported by UKRI Future Leaders Fellowship Grants (MR/T018712/1 and MR/Y033922/1) and a Medical Research Council grant (MR/Y004094/1). JCC was supported by UKRI Future Leaders Fellowships.</p></ack><notes><sec><title>Data Availability</title><p>Individuals contributing to this study did not consent to the public sharing of their highly sensitive clinical data, and we do not have ethical approval to make their data publicly available. The genetic exome sequencing data used in this study was previously deposited in Database of Genotypes and Phenotypes (dbGaP) with accession no. phs000687.</p></sec></notes><fn-group><fn fn-type="conflict"><p>ER and AFP report receiving grants from Akrivia Health for a project unrelated to this work. Akrivia played no part in the conception, design, implementation, or interpretation of this study. 
All other authors report no conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations From Transformers</p></def></def-item><def-item><term id="abb2">Bi-LSTM</term><def><p>Bidirectional Long Short-Term Memory</p></def></def-item><def-item><term id="abb3">BioBERT</term><def><p>Bidirectional Encoder Representations From Transformers for Biomedical Text Mining</p></def></def-item><def-item><term id="abb4">BioGPT</term><def><p>Generative Pre-Trained Transformer for Biomedical Text Generation and Mining</p></def></def-item><def-item><term id="abb5">ClinicalBERT</term><def><p>Clinical Bidirectional Encoder Representations From Transformers</p></def></def-item><def-item><term id="abb6">Flan-T5</term><def><p>Fine-Tuned Language Net Text-To-Text Transfer Transformer</p></def></def-item><def-item><term id="abb7"><italic>ICD-10</italic></term><def><p><italic>International Statistical Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb8">ID</term><def><p>intellectual disability</p></def></def-item><def-item><term id="abb9">IE</term><def><p>information extraction</p></def></def-item><def-item><term id="abb10">LLaMA</term><def><p>Large Language Model Architecture</p></def></def-item><def-item><term id="abb11">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb12">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb13">RoBERTa</term><def><p>Robustly Optimized BERT Approach</p></def></def-item><def-item><term id="abb14">SCAN</term><def><p>Schedules for Clinical Assessment in Neuropsychiatry</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Hegselmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sontag</surname><given-names>D</given-names> </name></person-group><article-title>Large language models are few-shot clinical information extractors</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 7-11, 2022</conf-date><conf-loc>Abu Dhabi, United Arab Emirates</conf-loc><fpage>1998</fpage><lpage>2022</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.emnlp-main">https://aclanthology.org/2022.emnlp-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.130</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Crowley</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name></person-group><article-title>Coreference resolution: a review of general methodologies and applications in the clinical domain</article-title><source>J Biomed Inform</source><year>2011</year><month>12</month><volume>44</volume><issue>6</issue><fpage>1113</fpage><lpage>1122</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2011.08.006</pub-id><pub-id pub-id-type="medline">21856441</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chapman</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Bridewell</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hanbury</surname><given-names>P</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>GF</given-names> </name><name name-style="western"><surname>Buchanan</surname><given-names>BG</given-names> </name></person-group><article-title>A simple algorithm for identifying negated findings and diseases in discharge summaries</article-title><source>J Biomed Inform</source><year>2001</year><month>10</month><volume>34</volume><issue>5</issue><fpage>301</fpage><lpage>310</lpage><pub-id pub-id-type="doi">10.1006/jbin.2001.1029</pub-id><pub-id pub-id-type="medline">12123149</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Berkowitz</surname><given-names>SJ</given-names> </name><etal/></person-group><article-title>MIMIC-CXR, a de-identified publicly available database of chest radiographs with free-text reports</article-title><source>Sci Data</source><year>2019</year><month>12</month><day>12</day><volume>6</volume><issue>1</issue><fpage>317</fpage><pub-id pub-id-type="doi">10.1038/s41597-019-0322-0</pub-id><pub-id pub-id-type="medline">31831740</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>YF</given-names> </name><name name-style="western"><surname>Henry</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Shen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Uzuner</surname><given-names>O</given-names> </name><name name-style="western"><surname>Rumshisky</surname><given-names>A</given-names> </name></person-group><article-title>The 2019 n2c2/UMass Lowell shared task on clinical concept normalization</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>10</month><day>1</day><volume>27</volume><issue>10</issue><fpage>1529</fpage><lpage>e1</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa106</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Skreta</surname><given-names>M</given-names> </name><name name-style="western"><surname>Arbabi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Automatically disambiguating medical acronyms with ontology-aware deep learning</article-title><source>Nat Commun</source><year>2021</year><month>09</month><day>7</day><volume>12</volume><issue>1</issue><fpage>5319</fpage><pub-id pub-id-type="doi">10.1038/s41467-021-25578-4</pub-id><pub-id pub-id-type="medline">34493718</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Unified language model pre-training for natural language understanding and generation</article-title><conf-name>NIPS&#x2019;19: 33rd International Conference on Neural Information Processing 
Systems</conf-name><conf-date>Dec 8-14, 2019</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><fpage>13063</fpage><lpage>13075</lpage><pub-id pub-id-type="doi">10.5555/3454287.3455457</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of NAACL-HLT Association for Computational Linguistics</conf-name><conf-date>Jun 2-7, 2019</conf-date><conf-loc>Minneapolis, Minnesota</conf-loc><fpage>4171</fpage><lpage>4186</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/N19-1423.pdf">https://aclanthology.org/N19-1423.pdf</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mosbach</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pimentel</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ravfogel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Klakow</surname><given-names>D</given-names> </name><name name-style="western"><surname>Elazar</surname><given-names>Y</given-names> </name></person-group><article-title>Few-shot fine-tuning vs. 
in-context learning: a fair comparison and evaluation</article-title><access-date>2025-09-15</access-date><conf-name>Findings of the Association for Computational Linguistics: ACL 2023</conf-name><conf-date>Jul 9-14, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.findings-acl">https://aclanthology.org/2023.findings-acl</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.findings-acl.779</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Improving language understanding by generative pre-training</article-title><source>OpenAI</source><year>2018</year><access-date>2025-09-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Viswanathan</surname><given-names>V</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bertsch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Neubig</surname><given-names>G</given-names> 
</name></person-group><article-title>Prompt2Model: generating deployable models from natural language instructions</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</conf-name><conf-date>Dec 6-10, 2023</conf-date><conf-loc>Singapore</conf-loc><fpage>413</fpage><lpage>421</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.emnlp-demo">https://aclanthology.org/2023.emnlp-demo</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-demo.38</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yin</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hay</surname><given-names>J</given-names> </name><name name-style="western"><surname>Roth</surname><given-names>D</given-names> </name></person-group><article-title>Benchmarking zero-shot text classification: datasets, evaluation and entailment approach</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name><conf-date>Nov 3-7, 2019</conf-date><conf-loc>Hong Kong, China</conf-loc><fpage>3914</fpage><lpage>3923</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/D19-1">https://www.aclweb.org/anthology/D19-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/D19-1404</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> 
</name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Boag</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Publicly available clinical BERT embeddings</article-title><conf-name>Proceedings of the 2nd Clinical Natural Language Processing Workshop, Anna Rumshisky</conf-name><conf-date>Jun 7-8, 2019</conf-date><conf-loc>Minneapolis, Minnesota, USA</conf-loc><fpage>72</fpage><lpage>78</lpage><pub-id pub-id-type="doi">10.18653/v1/W19-1909</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Edwards</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ushio</surname><given-names>A</given-names> </name><name name-style="western"><surname>Camacho-Collados</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ribaupierre</surname><given-names>H</given-names> </name><name name-style="western"><surname>Preece</surname><given-names>A</given-names> </name></person-group><article-title>Guiding generative language models for data augmentation in few-shot text 
classification</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the Fourth Workshop on Data Science with Human-in-the-Loop (Language Advances)</conf-name><conf-date>Dec 8-9, 2022</conf-date><conf-loc>Abu Dhabi, United Arab Emirates</conf-loc><fpage>51</fpage><lpage>63</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.dash-1.8.pdf">https://aclanthology.org/2022.dash-1.8.pdf</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Giridhara</surname><given-names>PKB</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>C</given-names> </name><name name-style="western"><surname>Venkataramana</surname><given-names>RKM</given-names> </name><name name-style="western"><surname>Bukhari</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Dengel</surname><given-names>A</given-names> </name></person-group><article-title>A study of various text augmentation techniques for relation classification in free text</article-title><conf-name>8th International Conference on Pattern Recognition Applications and Methods</conf-name><conf-date>Feb 19-21, 2019</conf-date><conf-loc>Prague, Czech Republic</conf-loc><fpage>360</fpage><lpage>367</lpage><pub-id pub-id-type="doi">10.5220/0007311003600367</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>T&#x00FC;rker</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Koutraki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sack</surname><given-names>H</given-names> </name></person-group><article-title>Knowledge-based 
short text categorization using entity and category embedding</article-title><conf-name>The Semantic Web: 16th International Conference, ESWC 2019</conf-name><conf-date>Jun 2-6, 2019</conf-date><conf-loc>Portoro&#x017E;, Slovenia</conf-loc><fpage>346</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-21348-0_23</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>B</given-names> </name></person-group><article-title>Short text classification based on feature extension using the N-gram model</article-title><conf-name>2015 12th International Conference on Fuzzy Systems and Knowledge Discovery (FSKD)</conf-name><conf-date>Aug 15-17, 2015</conf-date><conf-loc>Zhangjiajie, China</conf-loc><fpage>710</fpage><lpage>716</lpage><pub-id pub-id-type="doi">10.1109/FSKD.2015.7382029</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Le Scao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rush</surname><given-names>AM</given-names> 
</name></person-group><article-title>How many data points is a prompt worth?</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 6-11, 2021</conf-date><conf-loc>Online</conf-loc><fpage>2627</fpage><lpage>2636</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2021.naacl-main">https://aclanthology.org/2021.naacl-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.208</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Plaza-del-Arco</surname><given-names>FM</given-names> </name><name name-style="western"><surname>Nozza</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hovy</surname><given-names>D</given-names> </name></person-group><article-title>Leveraging label variation in large language models for zero-shot text classification</article-title><comment>Preprint posted online on  Jul 24, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.12973</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Child</surname><given-names>R</given-names> </name><name name-style="western"><surname>Luan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Amodei</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Language models are unsupervised multitask 
learners</article-title><source>OpenAI</source><year>2019</year><access-date>2025-09-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf">https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Schick</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sch&#x00FC;tze</surname><given-names>H</given-names> </name></person-group><article-title>Exploiting cloze-questions for few-shot text classification and natural language inference</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics</conf-name><conf-date>Apr 19-23, 2021</conf-date><conf-loc>Online</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2021.eacl-main">https://aclanthology.org/2021.eacl-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2021.eacl-main.20</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Labrak</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rouvier</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dufour</surname><given-names>R</given-names> </name></person-group><article-title>A zero-shot and few-shot study of instruction-finetuned large language models applied to clinical and biomedical tasks</article-title><access-date>2025-09-15</access-date><conf-name>Fourteenth Language Resources and Evaluation Conference (LREC-COLING 2024) Association for Computational 
Linguistics</conf-name><conf-date>May 20-25, 2024</conf-date><conf-loc>Online</conf-loc><fpage>2049</fpage><lpage>2066</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.lrec-main.185.pdf">https://aclanthology.org/2024.lrec-main.185.pdf</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Fisch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name></person-group><article-title>Making pre-trained language models better few-shot learners</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</conf-name><conf-date>Aug 1-6, 2021</conf-date><conf-loc>Online</conf-loc><fpage>3816</fpage><lpage>3830</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2021.acl-long">https://aclanthology.org/2021.acl-long</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.295</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chung</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Longpre</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Scaling instruction-finetuned language models</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 20, 2022</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2210.11416</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raffel</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Exploring the limits of transfer learning with a unified text-to-text transformer</article-title><source>J Mach Learn Res</source><year>2020</year><access-date>2025-09-15</access-date><volume>21</volume><issue>140</issue><fpage>1</fpage><lpage>67</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://jmlr.org/papers/v21/20-074.html">http://jmlr.org/papers/v21/20-074.html</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cornoldi</surname><given-names>C</given-names> </name><name name-style="western"><surname>Giofr&#x00E8;</surname><given-names>D</given-names> </name><name name-style="western"><surname>Orsini</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pezzuti</surname><given-names>L</given-names> </name></person-group><article-title>Differences in the intellectual profile of children with intellectual vs. 
learning disability</article-title><source>Res Dev Disabil</source><year>2014</year><month>09</month><volume>35</volume><issue>9</issue><fpage>2224</fpage><lpage>2230</lpage><pub-id pub-id-type="doi">10.1016/j.ridd.2014.05.013</pub-id><pub-id pub-id-type="medline">24927516</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moor</surname><given-names>M</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Abad</surname><given-names>ZSH</given-names> </name><etal/></person-group><article-title>Foundation models for generalist medical artificial intelligence</article-title><source>Nature</source><year>2023</year><month>04</month><volume>616</volume><issue>7956</issue><fpage>259</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-05881-4</pub-id><pub-id pub-id-type="medline">37045921</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sevilla</surname><given-names>J</given-names> </name><name name-style="western"><surname>Heim</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>A</given-names> </name><name name-style="western"><surname>Besiroglu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hobbhahn</surname><given-names>M</given-names> </name><name name-style="western"><surname>Villalobos</surname><given-names>P</given-names> </name></person-group><article-title>Compute trends across three eras of machine learning</article-title><conf-name>2022 International Joint Conference on Neural Networks (IJCNN)</conf-name><conf-date>Jul 18-23, 2022</conf-date><conf-loc>Padua, 
Italy</conf-loc><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1109/IJCNN55064.2022.9891914</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fabregat</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martinez-Romo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Araujo</surname><given-names>L</given-names> </name></person-group><article-title>Understanding and improving disability identification in medical documents</article-title><source>IEEE Access</source><year>2020</year><volume>8</volume><fpage>155399</fpage><lpage>155408</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2020.3019178</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Dan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>ChatDoctor: a medical chat model fine-tuned on a large language model meta-AI (LLaMA) using medical domain knowledge</article-title><source>Cureus</source><year>2023</year><month>06</month><volume>15</volume><issue>6</issue><fpage>e40895</fpage><pub-id pub-id-type="doi">10.7759/cureus.40895</pub-id><pub-id pub-id-type="medline">37492832</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name></person-group><article-title>Medical knowledge-enhanced prompt learning for diagnosis classification from clinical text</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 5th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jul 14-15, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><fpage>278</fpage><lpage>288</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.clinicalnlp-1">https://aclanthology.org/2023.clinicalnlp-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.clinicalnlp-1.33</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Meng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Shareghi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Su</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>C</given-names> </name><name name-style="western"><surname>Collier</surname><given-names>N</given-names> </name></person-group><article-title>Rewire-then-probe: a contrastive recipe for probing biomedical knowledge of pre-trained language models</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>May 22-27, 2022</conf-date><conf-loc>Dublin, Ireland</conf-loc><fpage>4798</fpage><lpage>4810</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://aclanthology.org/2022.acl-long">https://aclanthology.org/2022.acl-long</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.329</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Salek Faramarzi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bandarupally</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>R</given-names> </name></person-group><article-title>Context-aware medication event extraction from unstructured text</article-title><access-date>2025-09-15</access-date><conf-name>Proceedings of the 5th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jul 14-15, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><fpage>86</fpage><lpage>95</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.clinicalnlp-1">https://aclanthology.org/2023.clinicalnlp-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.clinicalnlp-1.11</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tran</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name></person-group><article-title>BioInstruct: instruction tuning of large language models for biomedical natural language processing</article-title><source>J Am Med Inform 
Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1821</fpage><lpage>1832</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae122</pub-id><pub-id pub-id-type="medline">38833265</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wornow</surname><given-names>M</given-names> </name><name name-style="western"><surname>Thapa</surname><given-names>R</given-names> </name><name name-style="western"><surname>Steinberg</surname><given-names>E</given-names> </name><name name-style="western"><surname>Fries</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>N</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Oh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Naumann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Globerson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Saenko</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hardt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Levine</surname><given-names>S</given-names> </name></person-group><article-title>EHRSHOT: an EHR benchmark for few-shot evaluation of foundation models</article-title><source>Advances in Neural Information Processing Systems 36 (NeurIPS 2023)</source><year>2023</year><volume>36</volume><publisher-name>NeurIPS</publisher-name><fpage>67125</fpage><lpage>67137</lpage></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> 
</name><name name-style="western"><surname>Xia</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>BioGPT: generative pre-trained transformer for biomedical text generation and mining</article-title><source>Brief Bioinformatics</source><year>2022</year><month>11</month><day>19</day><volume>23</volume><issue>6</issue><fpage>bbac409</fpage><pub-id pub-id-type="doi">10.1093/bib/bbac409</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name></person-group><article-title>AI in health: keeping the human in the loop</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>06</month><day>20</day><volume>30</volume><issue>7</issue><fpage>1225</fpage><lpage>1226</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad091</pub-id><pub-id pub-id-type="medline">37337923</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ruis</surname><given-names>F</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>S</given-names> </name><name name-style="western"><surname>Geerdink</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hegeman</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Seifert</surname><given-names>C</given-names> </name><name name-style="western"><surname>Van Keulen</surname><given-names>M</given-names> </name></person-group><article-title>Human-in-the-loop language-agnostic extraction of medication data from highly unstructured electronic health records</article-title><conf-name>2020 International Conference on Data Mining Workshops (ICDMW)</conf-name><conf-date>Nov 17-20, 2020</conf-date><conf-loc>Sorrento, 
Italy</conf-loc><fpage>644</fpage><lpage>650</lpage><pub-id pub-id-type="doi">10.1109/ICDMW51313.2020.00091</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Syed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ahmad</surname><given-names>H</given-names> </name><etal/></person-group><article-title>AI accelerated human-in-the-loop structuring of radiology reports</article-title><source>AMIA Annu Symp Proc</source><year>2020</year><volume>2020</volume><fpage>1305</fpage><lpage>1314</lpage><pub-id pub-id-type="medline">33936507</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Owen</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Legge</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Rees</surname><given-names>E</given-names> </name><name name-style="western"><surname>Walters</surname><given-names>JTR</given-names> </name><name name-style="western"><surname>O&#x2019;Donovan</surname><given-names>MC</given-names> </name></person-group><article-title>Genomic findings in schizophrenia and their implications</article-title><source>Mol Psychiatry</source><year>2023</year><month>09</month><volume>28</volume><issue>9</issue><fpage>3638</fpage><lpage>3647</lpage><pub-id pub-id-type="doi">10.1038/s41380-023-02293-8</pub-id><pub-id pub-id-type="medline">37853064</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Green</surname><given-names>MF</given-names> </name><name 
name-style="western"><surname>Horan</surname><given-names>WP</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name></person-group><article-title>Nonsocial and social cognition in schizophrenia: current evidence and future directions</article-title><source>World Psychiatry</source><year>2019</year><month>06</month><volume>18</volume><issue>2</issue><fpage>146</fpage><lpage>161</lpage><pub-id pub-id-type="doi">10.1002/wps.20624</pub-id><pub-id pub-id-type="medline">31059632</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rammos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kirov</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hubbard</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Family-based analysis of the contribution of rare and common genetic variants to school performance in schizophrenia</article-title><source>Mol Psychiatry</source><year>2023</year><month>05</month><volume>28</volume><issue>5</issue><fpage>2081</fpage><lpage>2087</lpage><pub-id pub-id-type="doi">10.1038/s41380-023-02013-2</pub-id><pub-id pub-id-type="medline">36914811</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wing</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Babor</surname><given-names>T</given-names> </name><name name-style="western"><surname>Brugha</surname><given-names>T</given-names> </name><etal/></person-group><article-title>SCAN. 
Schedules for clinical assessment in neuropsychiatry</article-title><source>Arch Gen Psychiatry</source><year>1990</year><month>06</month><volume>47</volume><issue>6</issue><fpage>589</fpage><lpage>593</lpage><pub-id pub-id-type="doi">10.1001/archpsyc.1990.01810180089012</pub-id><pub-id pub-id-type="medline">2190539</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="book"><article-title>Chapter V: mental and behavioural disorders (F00-F99)</article-title><source>International Statistical Classification of Diseases and Related Health Problems 10th Revision (ICD-10)</source><year>2019</year><access-date>2025-09-15</access-date><publisher-name>World Health Organization</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://icd.who.int/browse10/2019/en#/F70-F79">https://icd.who.int/browse10/2019/en#/F70-F79</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><etal/></person-group><article-title>HuggingFace&#x2019;s transformers: state-of-the-art natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 9, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.03771</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="web"><article-title>AleksEdwards/JMIR-june-25</article-title><source>GitHub</source><access-date>2025-09-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/AleksEdwards/JMIR-June-25">https://github.com/AleksEdwards/JMIR-June-25</ext-link></comment></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name></person-group><article-title>An evaluation of statistical approaches to text categorization</article-title><source>Inf Retr Boston</source><year>1999</year><month>04</month><volume>1</volume><issue>1-2</issue><fpage>69</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.1023/A:1009982220290</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Elbattah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Arnaud</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ghazali</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Dequen</surname><given-names>G</given-names> </name></person-group><article-title>Exploring the ethical challenges of large language models in emergency medicine: a comparative international review</article-title><conf-name>2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name><conf-date>Dec 3-6, 2024</conf-date><conf-loc>Lisbon, Portugal</conf-loc><fpage>5750</fpage><lpage>5755</lpage><pub-id pub-id-type="doi">10.1109/BIBM62325.2024.10822376</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>SYH</given-names> </name><name name-style="western"><surname>William</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Ethical and regulatory challenges of large language models in medicine</article-title><source>Lancet Digit 
Health</source><year>2024</year><month>06</month><volume>6</volume><issue>6</issue><fpage>e428</fpage><lpage>e432</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00061-X</pub-id><pub-id pub-id-type="medline">38658283</pub-id></nlm-citation></ref></ref-list></back></article>