<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e68020</article-id><article-id pub-id-type="doi">10.2196/68020</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Training Language Models for Estimating Priority Levels in Ultrasound Examination Waitlists: Algorithm Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Masayoshi</surname><given-names>Kanato</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Hashimoto</surname><given-names>Masahiro</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Toda</surname><given-names>Naoki</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Mori</surname><given-names>Hirozumi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kobayashi</surname><given-names>Goh</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Haque</surname><given-names>Hasnine</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>So</surname><given-names>Mizuki</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jinzaki</surname><given-names>Masahiro</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Radiology, School of Medicine, Keio University</institution><addr-line>35 Shinanomachi, Shinjuku-ku, Tokyo 160-8582</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff2"><institution>GE Healthcare Japan</institution><addr-line>4-7-127, Asahigaoka</addr-line><addr-line>Hino, Tokyo</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Huo</surname><given-names>Yuankai</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Changyu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Chenyu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Masahiro Hashimoto, MD, Department of Radiology, School of Medicine, Keio University, 35 Shinanomachi, Shinjuku-ku, Tokyo 160-8582, Tokyo, Japan, 81 
3-3353-1211 ext 62477; <email>m.hashimoto@rad.med.keio.ac.jp</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>22</day><month>7</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e68020</elocation-id><history><date date-type="received"><day>26</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>06</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>21</day><month>03</month><year>2025</year></date></history><copyright-statement>&#x00A9; Kanato Masayoshi, Masahiro Hashimoto, Naoki Toda, Hirozumi Mori, Goh Kobayashi, Hasnine Haque, Mizuki So, Masahiro Jinzaki. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 22.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e68020"/><abstract><sec><title>Background</title><p>Ultrasound examinations, while valuable, are time-consuming and often limited in availability. 
Consequently, many hospitals implement reservation systems; however, these systems typically lack prioritization for examination purposes. Hence, our hospital uses a waitlist system that prioritizes examination requests based on their clinical value when slots become available due to cancellations. This system, however, requires a manual review of examination purposes, which are recorded in free-form text. We hypothesized that artificial intelligence language models could preliminarily estimate the priority of requests before manual reviews.</p></sec><sec><title>Objective</title><p>This study aimed to investigate potential challenges associated with using language models for estimating the priority of medical examination requests and to evaluate the performance of language models in processing Japanese medical texts.</p></sec><sec sec-type="methods"><title>Methods</title><p>We retrospectively collected ultrasound examination requests from the waitlist system at Keio University Hospital, spanning January 2020 to March 2023. Each request comprised an examination purpose documented by the requesting physician and a 6-tier priority level assigned by a radiologist during the clinical workflow. We fine-tuned JMedRoBERTa, Luke, OpenCalm, and LLaMA2 under 2 conditions: (1) tuning only the final layer and (2) tuning all layers using either standard backpropagation or low-rank adaptation.</p></sec><sec sec-type="results"><title>Results</title><p>We had 2335 and 204 requests in the training and test datasets, respectively, after cleaning. When only the final layers were tuned, JMedRoBERTa outperformed the other models (Kendall coefficient=0.225). With full fine-tuning, JMedRoBERTa continued to perform best (Kendall coefficient=0.254), though with reduced margins compared with the other models. 
The radiologist&#x2019;s retrospective re-evaluation yielded a Kendall coefficient of 0.221.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Language models can estimate the priority of examination requests with accuracy comparable with that of human radiologists. The fine-tuning results indicate that general-purpose language models can be adapted to domain-specific texts (ie, Japanese medical texts) with sufficient fine-tuning. Further research is required to address priority rank ambiguity, expand the dataset across multiple institutions, and explore more recent language models with potentially higher performance or better suitability for this task.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>clinical informatics</kwd><kwd>large language model</kwd><kwd>machine learning</kwd><kwd>health resources</kwd><kwd>ultrasonography</kwd><kwd>hospital information systems</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Waitlist System</title><p>Ultrasound, a noninvasive imaging modality, enables real-time visualization of organs and blood flow and can be performed safely in pediatric and obstetric populations. However, imaging quality depends on the proficiency of the technician. Most hospitals implement reservation systems that allocate available slots to physicians due to a shortage of ultrasound technologists. Frequently, slots for the immediate future are fully booked, and these systems typically lack mechanisms for automatic urgency assessment.</p><p>Our hospital has implemented a waitlist system in case an appointment is canceled, and a slot becomes vacant. The system prioritizes examination requests based on urgency and clinical value. 
This approach facilitates more efficient use of canceled slots, reducing patient wait times, minimizing hospital stays, and improving overall care quality.</p><p>Our waitlist system organizes examination requests into 6 priority tiers, determined by board-certified radiologists based on the examination purpose, which is recorded as a brief free-text entry by the requesting physician (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The waitlist is accessible to all physicians, enabling them to anticipate when their orders might be processed. However, the delay in updating until radiologists complete their reviews has led to difficulties in providing real-time wait time estimates. Therefore, we investigated the potential of artificial intelligence (AI) language models to provide preliminary priority estimations.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Artificial intelligence-predicted priority levels will allow physicians to estimate waiting time before the official priority is determined by radiologists. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68020_fig01.png"/></fig></sec><sec id="s1-2"><title>Use of Language Models in Medicine</title><p>To perform this task, the AI models must process free-form text through natural language processing (NLP). NLP presents challenges due to the inherent ambiguity and complexity of natural languages. Historically, NLP approaches have used simplistic models, such as the bag-of-words method, which analyzes text as a mere collection of words without considering their order or contextual relationship. While this approach suffices for basic tasks, it does not adequately capture the intricacies of human language. 
Consequently, researchers have worked to incorporate linguistic insights into computational models to enhance their ability to process and understand natural language.</p><p>The advent of transformer architecture, particularly with bidirectional encoder representations from transformers (BERT), has revolutionized NLP [<xref ref-type="bibr" rid="ref1">1</xref>]. The ability of BERT to efficiently learn from extensive text corpora has significantly enhanced its contextual understanding and performance across various NLP tasks, minimizing strong inductive biases. BERT has also inspired the development of several transformer-based models tailored to specific domains, including medicine. Examples include BioBERT, ClinicalBERT, PubMedBERT, and BlueBERT [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Hence, we used JMedRoBERTa, a model specifically trained on a substantial corpus of Japanese medical research papers [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Large language models (LLMs), which often use architectures similar to BERT but with increased parameters and capabilities, particularly in text generation, have gained prominence. Empirical evidence from GPT-3 has demonstrated that scaling models improves performance, adhering to the scaling law in NLP [<xref ref-type="bibr" rid="ref7">7</xref>]. The term &#x201C;large&#x201D; is ambiguous, as BERT can also be considered an LLM. The introduction of ChatGPT [<xref ref-type="bibr" rid="ref8">8</xref>] and subsequent models, such as GPT-4 and PaLM (Pathways Language Model), has shown the success of LLMs across various fields, including medicine [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. 
Despite the proprietary nature of leading models due to high training costs and safety concerns, publicly available LLMs such as LLaMA2 and OpenCalm offer opportunities for research and evaluation of their potential and limitations [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>].</p></sec><sec id="s1-3"><title>Research Gap</title><p>The application of AI for priority estimation has been predominantly investigated in the context of emergency department (ED) triage [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. Several AI models use NLP techniques to analyze medical texts [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>], aiming to rank patients or requests to optimize the allocation of limited medical resources to those in urgent need. While these models have shown promise in improving resource allocation within ED, extending research into medical priority estimation beyond ED could further enhance patients&#x2019; quality of life, reduce hospital stay durations, and lower medical costs. Therefore, additional research is required to explore AI applications in medical priority estimation across various clinical settings.</p><p>This study provides valuable insights into both priority estimation and the broader field of medical NLP and LLM applications. Although LLMs have primarily been used for generative tasks, demonstrating innovative applications, these models underperform in scenarios requiring structured and predictable outputs. Such challenges are evident in health care settings, where integrating AI into hospital systems necessitates a high degree of precision and reliability that generative models do not consistently provide. Furthermore, current research on medical LLMs predominantly focuses on question-answering (QA) metrics [<xref ref-type="bibr" rid="ref9">9</xref>], overshadowing the exploration of LLM potential for non-QA tasks. 
Emphasizing LLM applications beyond QA could reveal new practical uses in medicine.</p><p>A significant challenge in applying LLMs to our context arises from the linguistic and contextual differences between the pretraining datasets, primarily in general English, and our specific use cases involving Japanese medical terminology. This mismatch impairs the model&#x2019;s understanding of specialized terms and complicates tokenization. Tokenizers, though less studied than model size and datasets, can significantly influence the performance of LLMs in non-English contexts [<xref ref-type="bibr" rid="ref20">20</xref>]. Our study addresses this issue by evaluating and enhancing LLMs&#x2019; linguistic and contextual adaptability for diverse clinical applications.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dataset</title><p>We retrospectively collected ultrasound examination requests from the waitlist system at Keio University Hospital (<xref ref-type="fig" rid="figure1">Figure 1</xref>) from January 2020 to March 2023. Each record comprised the requesting department, the examination slot, and the examination purpose documented by the requesting physician. In addition, records included a 6-tier priority level assigned by a board-certified radiologist during the clinical workflow, which served as the ground truth for the AI models. The criteria for determining priority levels are outlined in <xref ref-type="other" rid="box1">Textbox 1</xref>. 
Priority level 6 was excluded from the dataset due to its rarity (only a few records), and physicians typically communicated directly with radiologists in such cases.</p><boxed-text id="box1"><title> Criteria for priority levels</title><p>1) Desired before discharge if possible.</p><p>2) Required for treatment decisions.</p><p>3) Preferred early.</p><p>4) Urgently required.</p><p>5) Immediately required.</p><p>6) Emergency (excluded).</p></boxed-text><p>The dataset underwent 3 main preprocessing steps to ensure data quality and consistency: aggregation, cleaning, and text normalization.</p><sec id="s2-1-1"><title>Aggregation</title><p>Initially, records with similar request texts were aggregated using the Levenshtein distance metric, and the majority priority level was assigned to the representative record within each cluster. This aggregation was essential because the dataset contained approximately identical waitlist records for common ultrasound scenarios, such as postoperative monitoring or specific clinical pathways. Duplicates could skew sample weights during model training, and inconsistencies in priority levels could adversely affect accuracy. We aimed to reduce these risks and create a more uniform and reliable dataset for model training by aggregating similar records.</p></sec><sec id="s2-1-2"><title>Cleaning</title><p>This phase involved eliminating records unsuitable for analysis. Specifically, we excluded entries with zero or invalid priority levels because these could not contribute to meaningful priority estimation. In addition, we removed records with date-specific requests (eg, &#x201C;Can we schedule an ultrasound examination by May 3?&#x201D;) because temporal references could bias priority estimations and present challenges for AI models during prediction. 
This meticulous pruning ensured the remaining dataset was relevant and suitable for accurate modeling.</p></sec><sec id="s2-1-3"><title>Text Normalization</title><p>The final preprocessing step aimed to enhance textual consistency. We removed extraneous spaces and corrected punctuation errors by standardizing the text format across the dataset. This normalization was crucial for minimizing variability in model input and ensuring accurate text interpretation by AI.</p><p>After preprocessing, approximately 10% of the dataset was reserved for testing, with the remaining portion allocated for training. The dataset was divided based on referring doctors to ensure that requests from a single physician appeared exclusively in the training or test subset.</p></sec></sec><sec id="s2-2"><title>Models</title><p>We used several pretrained models: JMedRoBERTa, Luke, OpenCalm 7B, and LLaMA2 7B [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref21">21</xref>], all of which are accessible via Hugging Face (<xref ref-type="table" rid="table1">Table 1</xref>) [<xref ref-type="bibr" rid="ref22">22</xref>]. Both OpenCalm and LLaMA2 offer multiple variants with different model sizes; however, we selected the 7B model due to computational resource limitations. These 4 models were chosen based on their size and the semantic alignment between their pretraining datasets and our downstream task. Ideally, the optimal model should possess a large number of parameters and be trained on a dataset that aligns semantically and linguistically with the downstream task. However, there is often a trade-off between model size and dataset alignment. 
In this study, we experimented with models positioned at different points along this trade-off, providing valuable insights into how this balance can be managed for medical text classification tasks using LLMs.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Model details.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Number of Parameters</td><td align="left" valign="bottom">Language of training dataset</td><td align="left" valign="bottom">Category of training dataset</td></tr></thead><tbody><tr><td align="left" valign="top">JMedRoBERTa</td><td align="left" valign="top">124 million</td><td align="left" valign="top">Japanese</td><td align="left" valign="top">Medical paper</td></tr><tr><td align="left" valign="top">Luke</td><td align="left" valign="top">562 million</td><td align="left" valign="top">Japanese</td><td align="left" valign="top">Wikipedia</td></tr><tr><td align="left" valign="top">OpenCalm</td><td align="left" valign="top">7 billion</td><td align="left" valign="top">Japanese</td><td align="left" valign="top">Mixed<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">LLaMA2</td><td align="left" valign="top">7 billion</td><td align="left" valign="top">English (mainly)<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">Mixed<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Large language models are generally pretrained on diverse text data to maximize the use of their extensive parameters.</p></fn><fn id="table1fn2"><p><sup>b</sup>LLaMA2 was primarily designed for English, but its training dataset included some Japanese data.</p></fn></table-wrap-foot></table-wrap><p>To establish a performance baseline, we also tested conventional NLP methods: support 
vector machine, random forest, and XGBoost (eXtreme Gradient Boosting) [<xref ref-type="bibr" rid="ref23">23</xref>]. The same input text used for the LLMs was processed into a list of words with MeCab [<xref ref-type="bibr" rid="ref24">24</xref>], using the mecab-ipadic-NEologd dictionary [<xref ref-type="bibr" rid="ref25">25</xref>]. This list of words was then converted into a vector using the term frequency-inverse document frequency.</p><p>The model input adhered to the template provided in <xref ref-type="other" rid="box2">Textbox 2</xref>. We experimented with various prompts, ranging from simpler to more complex ones (such as role prompting or few-shot). Ultimately, we found that this simple prompting worked best for our task. We trained the models to predict the correct priority levels using continuous numbers (ie, regression). Training was conducted under 2 conditions: fine-tuning only the final layer and fine-tuning all layers. However, fine-tuning all layers was impractical due to the large number of parameters in OpenCalm and LLaMA2. Therefore, we used low-rank adaptation (LoRA) with parameter-efficient fine-tuning using <italic>r</italic>=32 [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. The models were optimized using the AdamW optimizer with &#x03B2;&#x2081;=0.9, &#x03B2;&#x2082;=0.999, and a weight decay of 0.0001. The loss function used was mean squared error. Learning rates were set to 1e-5 for final-layer fine-tuning and 1e-7 for full-layer or LoRA-based tuning. 
Training was performed over 100 epochs, using the NVIDIA RTX A6000 GPU.</p><boxed-text id="box2"><title> Record was fed into the models using the simple prompt.</title><p><bold>[Template]</bold></p><p><bold>Input (original</bold>): &#x8A3A;&#x7642;&#x79D1; : {&#x8A3A;&#x7642;&#x79D1;}; &#x691C;&#x67FB;&#x9805;&#x76EE; : {&#x691C;&#x67FB;&#x67A0;}; &#x4F9D;&#x983C;&#x76EE;&#x7684; : {&#x4F9D;&#x983C;&#x76EE;&#x7684;};</p><p><bold>Input (translated</bold>): Department: {department}; Examination Item: {slot type}; Purpose: {purpose};</p><p><bold>[Example]</bold></p><p><bold>Input (original</bold>): &#x8A3A;&#x7642;&#x79D1;: &#x4E00;&#x822C;&#x30FB;&#x6D88;&#x5316;&#x5668;&#x5916;&#x79D1;; &#x691C;&#x67FB;&#x9805;&#x76EE;: &#x672B;&#x68A2;&#x8840;&#x7BA1; (&#x4E21;&#xFF09;&#x4E0B;&#x80A2;&#x9759;&#x8108;; &#x4F9D;&#x983C;&#x76EE;&#x7684;: &#x809D;&#x7D30;&#x80DE;&#x764C;&#x8853;&#x5F8C;&#xFF24;&#x2212;&#xFF44;&#xFF49;&#xFF4D;&#xFF45;&#xFF52;&#x4E0A;&#x6607;&#x3042;&#x308A;&#x7CBE;&#x67FB;&#x76EE;&#x7684;&#x3067;&#x3059;;</p><p><bold>Input (translated</bold>): Department: General and gastrointestinal surgery; Examination Item: (bilateral) veins of lower extremities; Purpose: After hepatocellular carcinoma surgery, D-dimer elevated. Needs further inspection.</p><p><bold>Priority level</bold>: 2</p></boxed-text></sec><sec id="s2-3"><title>Evaluation</title><p>Kendall tau-b, the rank correlation coefficient, was used as the primary evaluation metric. While root-mean-squared error is a common metric for regression tasks, measuring the distance between predicted and actual values, our focus on accurately estimating the priority order for medical examinations made the alignment between predicted and actual rankings crucial. Therefore, Kendall tau-b was preferred, emphasizing the significance of ordinal relationships over quantitative discrepancies. 
In addition, we created the confusion matrices by rounding the continuous prediction values to the nearest integers.</p><p>In addition, we assessed the ability of the model to identify low priority (priority level=1) and high priority (priority level&#x003E;=4) requests. Performance in this classification task was quantitatively assessed using the area under the receiver operating characteristic curve (ROC-AUC) and the <italic>F</italic><sub>1</sub>-score. The thresholds were optimized individually for each classification task.</p><p>Finally, we compared the accuracy of the language models with a retrospective re-evaluation performed by a radiologist. A board-certified radiologist (MH) assigned priority levels to all records in the test dataset based solely on the same text presented to the AI models. This comparison served as a benchmark for our model&#x2019;s predictions and provided valuable insights into the challenges and consistency involved in priority assignment.</p></sec><sec id="s2-4"><title>Error Analysis</title><p>We analyzed instances where the model&#x2019;s predictions deviated from the actual priority levels to identify potential biases and causes of errors. The error analysis was conducted on the model that achieved the best performance, as indicated by the highest Kendall score. We extracted all samples from the test dataset with an absolute error of &#x003E;1. These errors were classified as either overestimations or underestimations. Each mispredicted sample was reviewed to determine the underlying patterns or common characteristics contributing to the discrepancies. Discrepancies between the original and re-evaluation radiologist ratings were also investigated to assess the difficulty and consistency of priority estimation. 
In addition, we used the Shapley Additive Explanation (SHAP) [<xref ref-type="bibr" rid="ref28">28</xref>] method to visualize the importance scores of each token in the input.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>This study received approval from the Research Ethics Committee of Keio University Hospital (approval number: 20170018) and adhered to the Declaration of Helsinki and other pertinent ethical guidelines. All patient data were processed on the machine located inside the hospital&#x2019;s secure intranet, isolated from the public internet, thereby ensuring participant privacy and confidentiality. The requirement for written informed consent was waived due to the retrospective observational nature of the study. The output of the AI models did not influence actual clinical practice.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Dataset</title><p>The initial dataset comprised 3654 ultrasound examination requests. After text similarity aggregation and data cleaning, the final dataset consisted of 2539 records (<xref ref-type="fig" rid="figure2">Figure 2</xref>). These were further divided into training and test datasets of 2335 and 204 records, respectively, ensuring that requests from each referring doctor appeared exclusively in either training or test subset to prevent data leakage and maintain evaluation integrity. The distribution of assigned priority levels is depicted in <xref ref-type="fig" rid="figure3">Figure 3</xref>. Most requests were assigned priority levels 2 or 3, while requests at priority level 5 were extremely rare. 
The distribution of priority levels did not exhibit significant skewness despite variability in the number of requests reviewed by each radiologist.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Dataset flowchart.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68020_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>The majority of orders are assigned to priority levels 2 or 3 with little skewness between radiologists.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68020_fig03.png"/></fig></sec><sec id="s3-2"><title>Evaluation</title><p><xref ref-type="table" rid="table2">Table 2</xref> and <xref ref-type="fig" rid="figure4">Figure 4</xref> show the performance of each model across different metrics. As expected, fully fine-tuned models outperformed the final-layer fine-tuned models. In particular, the fully fine-tuned JMedRoBERTa achieved the highest Kendall tau-b of 0.254. All the fully fine-tuned LLMs surpassed the baseline of conventional models, and notably, they also outperformed the radiologist re-evaluation in terms of Kendall tau-b. However, this result should be interpreted with caution, as it may reflect the inherent ambiguity of the priority estimation task, a topic that will be further discussed later. We observed that training all layers or using LoRA not only improved accuracy across all models but also narrowed the performance disparity between JMedRoBERTa and the other models. 
Regarding the classification tasks, JMedRoBERTa and Luke performed well, with ROC-AUC ranging from approximately 0.75 to 0.8.</p><p>For model-specific prediction trends, the JMedRoBERTa predictions (<xref ref-type="fig" rid="figure5">Figures 5A</xref> and <xref ref-type="fig" rid="figure5">B</xref>) revealed a trend where the distribution of AI-predicted values shifted upward as the actual priority level increased, indicating a positive correlation between AI predictions and the radiologists&#x2019; assessments.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance metrics.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Model and fine-tuned layers</td><td align="left" valign="bottom">Regression</td><td align="left" valign="bottom" colspan="2">Low priority classification</td><td align="left" valign="bottom" colspan="2">High priority classification</td></tr><tr><td align="left" valign="bottom">Kendall</td><td align="left" valign="bottom">ROC-AUC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom">ROC-AUC</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6">JMedRoBERTa</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Final</td><td align="left" valign="top">0.225</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.40</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.25</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>All</td><td 
align="left" valign="top">0.254</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.50</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.29</td></tr><tr><td align="left" valign="top" colspan="6">Luke</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Final</td><td align="left" valign="top">0.170</td><td align="left" valign="top">0.65</td><td align="left" valign="top">0.24</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.30</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>All</td><td align="left" valign="top">0.236</td><td align="left" valign="top">0.82</td><td align="left" valign="top">0.45</td><td align="left" valign="top">0.74</td><td align="left" valign="top">0.36</td></tr><tr><td align="left" valign="top" colspan="6">LLaMA2-7b</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Final</td><td align="left" valign="top">0.197</td><td align="left" valign="top">0.72</td><td align="left" valign="top">0.31</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.23</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LoRA<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.231</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.26</td></tr><tr><td align="left" valign="top" colspan="6">OpenCalm</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Final</td><td align="left" valign="top">0.180</td><td align="left" valign="top">0.67</td><td align="left" 
valign="top">0.20</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.31</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LoRA<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0.242</td><td align="left" valign="top">0.65</td><td align="left" valign="top">0.23</td><td align="left" valign="top">0.76</td><td align="left" valign="top">0.25</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">0.167</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.27</td><td align="left" valign="top">0.49</td><td align="left" valign="top">0.08</td></tr><tr><td align="left" valign="top">Random forest</td><td align="left" valign="top">0.198</td><td align="left" valign="top">0.63</td><td align="left" valign="top">0.25</td><td align="left" valign="top">0.48</td><td align="left" valign="top">0.14</td></tr><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">0.176</td><td align="left" valign="top">0.60</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.56</td><td align="left" valign="top">0.23</td></tr><tr><td align="left" valign="top">Radiologist re-evaluation</td><td align="left" valign="top">0.221</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.31</td><td align="left" valign="top">0.62</td><td align="left" valign="top">0.20</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>ROC-AUC: area under the receiver operating characteristic curve.</p></fn><fn id="table2fn2"><p><sup>b</sup>It should be noted that these were highly imbalanced classification tasks, and therefore, <italic>F</italic><sub>1</sub>-score tends to be lower. 
(A completely random classifier would yield an <italic>F</italic><sub>1</sub>-score of around 0.1).</p></fn><fn id="table2fn3"><p><sup>c</sup>Low-rank adaptation (r=32).</p></fn><fn id="table2fn4"><p><sup>d</sup>SVM: support vector machine.</p></fn><fn id="table2fn5"><p><sup>e</sup>XGBoost: extreme gradient boosting.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>All layers or low-rank adaptation fine-tuning improves accuracy in all models, narrowing the performance gap between the medical language model and other general-purpose language models. LoRA: low-rank adaptation; ROC-AUC: area under the receiver operating characteristic curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68020_fig04.png"/></fig><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>JMedRoBERTa performance. (A) The distribution of priority levels predicted by the fine-tuned JMedRoBERTa model was mostly consistent with the radiologist rating except for confusion between priority levels 2 and 3. (B) Confusion matrix also shows that the model was primarily confused by priority levels 2 and 3. (C) The model detected low (&#x003C;=1) or high (&#x003E;=4) priority orders at an ROC-AUC of around 0.8. LoRA: low-rank adaptation; ROC-AUC: area under the receiver operating characteristic curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68020_fig05.png"/></fig><p>A total of 39 error cases (absolute error&#x003E;1.0) were identified, including 25 overestimated and 14 underestimated cases. The most common misclassification was confusion between priority levels 2 and 3, which was also observed in the radiologist re-evaluation (<xref ref-type="fig" rid="figure6">Figure 6</xref>). 
The tendency of errors made by radiologists and AI was similar (<xref ref-type="fig" rid="figure7">Figure 7</xref>). A more detailed analysis of these errors follows in the &#x201C;Discussion&#x201D; section.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Radiologist re-evaluation performance. Even a radiologist struggled to replicate priority levels 2 and 3. ROC: receiver operating characteristic.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68020_fig06.png"/></fig><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>The model and radiologist tend to make similar types of errors, as seen in the upper left and lower right cells. Underestimation and overestimation are defined as a deviation of more than one level from the original radiologist rating.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68020_fig07.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study used language models to predict priority levels for an ultrasound examination waitlist system. JMedRoBERTa, pretrained on a Japanese medical paper dataset, demonstrated the highest performance. Other models also performed comparably when fully fine-tuned or adjusted with LoRA. This section discusses a comparison of the performance of different models, focusing on domain and language adaptations. After that, the challenges of prioritizing tasks due to the variability of priority levels are addressed. Subsequently, the nature of the error samples is discussed, followed by the ethical and social implications. 
Finally, the limitations of the study and its conclusions are presented.</p></sec><sec id="s4-2"><title>Domain and Language Adaptation</title><p>Comparing the performances of the models provides insights into the influence of model size, pretraining datasets, and fine-tuning methods on the cross-domain and cross-language adaptation capabilities of LLMs. The critical factor influencing performance in this experiment was the alignment between the pretraining dataset and the downstream task. JMedRoBERTa, pretrained on a Japanese medical paper dataset, achieved superior performance despite having the smallest model size. JMedRoBERTa could focus exclusively on learning the priority assignment rules, while other models had to contend with both unfamiliar vocabulary and priority assignment rules.</p><p>However, this observation may change as the number of model parameters increases. Models pretrained on nonmedical and non-Japanese data may benefit relatively more from a larger number of training samples. In particular, as the dataset size grows, a model&#x2019;s representational capacity (roughly reflected by the number of parameters) may become a more dominant factor than the similarity of its pretraining dataset, as sufficient data would enable such models to adapt to the downstream task domain.</p><p>Meanwhile, LoRA reduced the performance gap between domain-specific and general language models. Final-layer fine-tuning can be viewed as similar to zero-shot learning, as it only updates the final layer, which primarily serves to format the output from internal representations rather than contributing to text comprehension. In contrast, fine-tuning all layers, rather than just the final one, allowed the models to better adapt to the specific domain and language of the downstream task. 
Although full-parameter fine-tuning theoretically offers performance superior to that of LoRA [<xref ref-type="bibr" rid="ref29">29</xref>], it often remains impractical due to constraints in computational resources (primarily memory capacity). Consequently, parameter-efficient tuning remains crucial for applying LLMs to medical tasks.</p><p>While our study used LoRA for fine-tuning all layers, there are other parameter-efficient tuning methods. For example, Sukeda et al [<xref ref-type="bibr" rid="ref30">30</xref>] highlighted LoRA instruction tuning as a promising approach for adapting LLMs to Japanese medical QA tasks [<xref ref-type="bibr" rid="ref30">30</xref>]. In addition, quantization is a popular technique that significantly reduces memory requirements while maintaining performance [<xref ref-type="bibr" rid="ref31">31</xref>]. Our findings support the effectiveness of parameter-efficient fine-tuning, demonstrating that general-purpose LLMs can achieve capabilities comparable to those of fully fine-tuned domain-specific models.</p><p>The influence of tokenization on task performance was minimal. Only the JMedRoBERTa tokenizer could accurately recognize medical terms. Conversely, the Luke tokenizer recognized only nonmedical Japanese words, often splitting medical terms into multiple tokens. The other 2 tokenizers failed to process most Japanese characters correctly, resorting to byte fallback, where single characters were segmented into multiple tokens based on their Unicode representation. However, all models delivered comparable performances when fine-tuned. Since LoRA tuning does not change the tokenizer, it is suggested that the tokenization quality minimally affects the performance of this specific downstream task.</p></sec><sec id="s4-3"><title>Challenges in Reproducing Priority Assignments</title><p>Although the AI model outperformed the radiologist re-evaluation, this result does not necessarily indicate the superiority of LLMs in priority estimation. 
Instead, it highlights the inherent ambiguity of the task itself. The priority levels were originally assigned by board-certified radiologists with sufficient clinical experience; however, the relatively low interrater agreement suggests that the process is inherently subjective.</p><p>Priority assignments are influenced by various factors, some of which are only available in the real-time clinical setting, leading to discrepancies between the original and re-evaluation ratings. For instance, the number of pending orders, the availability of examination equipment, and seasonal variations such as holidays can impact decision-making. In addition, in urgent cases, physicians may directly consult radiologists or the examination department, a factor that will not be captured in the dataset used for AI training or the radiologist re-evaluation. Consequently, even experienced radiologists may find it challenging to precisely reproduce the original priority levels in a retrospective setting, and AI models face similar challenges.</p><p>To mitigate this, cases influenced by external factors should be excluded from the training dataset, with radiologists allowed to flag them. Also, enhancing request records with supplementary information would improve reproducibility. For instance, radiologists could annotate the reasons behind priority decisions, enabling AI models to learn their decision-making processes. Providing AI with comprehensive clinical notes could enrich the contextual information. While reviewing all patient charts to determine priority is impractical for humans, AI language models can process extensive text rapidly. This capability might enable AI models to exceed human performance in priority estimation.</p></sec><sec id="s4-4"><title>Error Analysis</title><p>There were 25 overestimated cases and 14 underestimated cases. 
The model errors can stem from 2 main sources: the inherent difficulty of replicating the assignment and the limitations of the model. As described previously, replicating radiologists&#x2019; priority assignments made in the clinical setting is inherently challenging, and both AI models and radiologists are affected by this uncertainty. In fact, as demonstrated in <xref ref-type="fig" rid="figure7">Figure 7</xref>, the model and the radiologist re-evaluation exhibited similar patterns of misclassification, with no instances where the model greatly overestimated while the radiologist greatly underestimated, or vice versa. This observation suggests that certain underlying factors (ie, inherent difficulty) may be causing both the model and the radiologist to make similar errors.</p><p>To investigate the error cases further, we examined which parts of the input text contributed most to the model&#x2019;s predictions using SHAP. However, interpreting SHAP values in transformer-based models presents certain challenges. Since these models capture contextual relationships more holistically than conventional approaches, SHAP values do not always highlight clinically meaningful tokens. The same word can have highly varying SHAP values in different inputs, and common tokens appearing in all samples may absorb baseline importance, leading to misleading attributions. To partly mitigate this, we adjusted SHAP calculations to reduce the influence of shared tokens and improve interpretability. Despite these limitations of SHAP in our context, some cases yielded meaningful insights. We present the representative cases in <xref ref-type="other" rid="box3">Textbox 3</xref>, and we will discuss each case below.</p>
<boxed-text id="box3"><title> High Shapley Additive Explanation tokens are shown in bold and underlined. Some cases exhibited insightful Shapley Additive Explanation values, demonstrating the model&#x2019;s focus on key terms or revealing sources of misprediction.</title><p>[Sample 1 (Original: 3, Re-evaluation: 4, AI prediction: 3.784)]</p><p>[Input (Japanese)] &#x8A3A;&#x7642;&#x79D1; : &#x6574;&#x5F62;&#x5916;&#x79D1;; &#x691C;&#x67FB;&#x9805;&#x76EE; : &#x8179;&#x90E8;<bold><underline>&#x4E0A;&#x8179;&#x90E8;</underline></bold>; &#x4F9D;&#x983C;&#x76EE;&#x7684; : <bold><underline>&#x80C6;&#x56A2;&#x708E;</underline></bold>&#x7591;&#x3044;</p><p>[Translated] Department: Orthopedics; Examination Item: abdomen, <bold><underline>upper abdomen</underline></bold>; Purpose: <bold><underline>Cholecystitis</underline></bold> is suspected.</p><p>[Sample 2 (Original: 4, Re-evaluation: 3, AI prediction: 3.634)]</p><p>[Input (Japanese)] &#x8A3A;&#x7642;&#x79D1; : <bold><underline>&#x7523;&#x5A66;&#x4EBA;&#x79D1;</underline></bold>; &#x691C;&#x67FB;&#x9805;&#x76EE; : &#x8179;&#x90E8; &#x4E0A;&#x8179;&#x90E8;; &#x4F9D;&#x983C;&#x76EE;&#x7684; : <bold><underline>&#x598A;&#x5A20;</underline></bold> 15 &#x9031;&#x4EA4;&#x901A;&#x4E8B;&#x6545;&#x30B7;&#x30FC;&#x30C8;&#x30D9;&#x30EB;&#x30C8;&#x75D5;&#x3042;&#x308A;&#x809D;&#x6A5F;&#x80FD;&#x5FAE;&#x5897;<bold><underline>&#x3057;&#x3066;&#x304A;&#x308A;, &#x809D;&#x640D;&#x50B7;</underline></bold>&#x7591;&#x3063;&#x3066;&#x304A;&#x308A;&#x307E;&#x3059; FAST &#x306F;&#x9670;&#x6027;&#x3067;&#x3059;&#x304C;, &#x53F3;&#x5074;&#x80F8;&#x90E8;&#x306E;&#x81EA;&#x767A;&#x75DB;&#x3042;&#x308A;&#x307E;&#x3059;&#x5FA1;&#x9AD8;&#x8A3A;&#x304A;&#x9858;&#x3044;&#x3057;&#x307E;&#x3059;</p><p>[Translated] Department: <bold><underline>OBGYN</underline></bold>; Examination Item: abdomen, upper abdomen; Purpose: A 
15-week <bold><underline>pregnant</underline></bold> traffic accident with a seatbelt mark. Liver enzymes are mildly elevated, <bold><underline>and liver injury</underline></bold> is suspected. FAST is negative, but she complains of spontaneous pain in the right side of her chest.</p><p>[Sample 3 (Original: 3, Re-evaluation: 2, AI prediction: 4.070)]</p><p>[Input (Japanese)] &#x8A3A;&#x7642;&#x79D1; : &#x5FC3;&#x81D3;&#x8840;&#x7BA1;&#x5916;&#x79D1;; &#x691C;&#x67FB;&#x9805;&#x76EE; : &#x9838;&#x90E8;&#x30FB;&#x7532;&#x72B6;&#x817A;&#x30FB;&#x9670;<bold>&#x56A2;</bold>&#x30FB;&#x305D;&#x306E;&#x4ED6;&#x8868;&#x5728; &#x981A;&#x52D5;&#x8108;&#x30C9;&#x30C3;&#x30D7;&#x30E9;&#x30FC;; &#x4F9D;&#x983C;&#x76EE;&#x7684; : &#x4E0B;&#x884C;&#x5927;&#x52D5;&#x8108;&#x7624;<bold><underline>&#x7834;&#x88C2;</underline></bold>&#x5F8C;, &#x4EEE;&#x6027;<bold><underline>&#x7624;&#x7591;&#x3044;</underline></bold>&#x3002;<underline><bold>&#x8853;&#x524D;&#x8A55;&#x4FA1;</bold></underline>&#x3067;&#x3059;&#x3002;</p><p>[Translated] Department: Cardiovascular surgery; Examination Item: Thyroid, Scrotum, and Other Superficial Structures / Carotid Doppler; Purpose: <bold><underline>Suspected pseudoaneurysm</underline></bold> following a <bold><underline>ruptured</underline></bold> descending aortic aneurysm. <bold><underline>Preoperative evaluation</underline></bold>.</p></boxed-text><sec id="s4-4-1"><title>Sample 1: Acute Cholecystitis Suspicion</title><p>The AI model correctly assigned a high priority to a case of suspected acute cholecystitis, focusing on the keyword &#x201C;cholecystitis.&#x201D; Given that ultrasound is the effective diagnostic tool for this condition and that surgical intervention may be required promptly, this prioritization aligns well with clinical expectations. 
This example demonstrates that when a request contains an explicit keyword suggesting a critical condition, the model can effectively capture its importance.</p></sec><sec id="s4-4-2"><title>Sample 2: Traumatic Liver Injury in a Pregnant Person</title><p>For a pregnant patient involved in a motor vehicle accident with concerns of hepatic injury, the model also assigned a high priority. SHAP analysis revealed that the model placed significant weight on the terms &#x201C;pregnancy&#x201D; and &#x201C;liver injury,&#x201D; suggesting that it successfully incorporated both the trauma and the patient&#x2019;s physiological condition into its decision-making. The model&#x2019;s ability to recognize such contextual factors is encouraging.</p></sec><sec id="s4-4-3"><title>Sample 3: Preoperative Evaluation for Aortic Aneurysm</title><p>In this case, a carotid Doppler was requested for the preoperative evaluation of a pseudoaneurysm following a ruptured aortic aneurysm. While &#x201C;aneurysm&#x201D; and &#x201C;rupture&#x201D; typically indicate urgency, this patient appears to be stable, and the surgery is scheduled rather than urgent. If this were an urgent surgical case, it would be unlikely for the doctor to request an ultrasound examination from the radiology department. In fact, radiologists assigned a midrange priority of 2 or 3, reflecting the nonemergent nature of the request. However, the model assigned a priority of 4, overestimating the urgency. 
This suggests that the model may sometimes overprioritize cases based on emergency-associated keywords without fully considering the clinical context.</p><p>Overall, SHAP analysis indicates that the model performs well in straightforward cases where the primary pathology is explicitly mentioned but struggles with nuanced clinical scenarios requiring deeper contextual understanding.</p></sec></sec><sec id="s4-5"><title>Clinical Implementation&#x2014;Benefits</title><p>The current waitlist system already provides several clinical and operational advantages. Integrating AI could further enhance its efficiency by addressing the key limitations of ensuring rapid, fair, and consistent priority assignment. This section examines the benefits of the waitlist system and the role of AI in priority estimation separately.</p><p>A priority-based waitlist system offers multiple benefits. First, it improves clinical outcomes by facilitating faster ultrasound examinations for urgent cases, enabling timely clinical decisions. In addition, it can shorten hospitalization durations, especially when ultrasound examinations are critical for determining discharge eligibility. By increasing the transparency of the examination scheduling process, this system helps physicians estimate examination dates more accurately, thereby improving planning. Furthermore, effective prioritization supports better bed management and overall hospital efficiency, allowing for higher patient turnover and boosting institutional revenue.</p><p>Despite these advantages, the existing manual priority assignment process presents several challenges. Radiologists face an increased workload due to the need for subjective prioritization, leading to delays in determining priority levels. Furthermore, inconsistencies may arise from variations in clinical judgment, making prioritization less reliable.</p><p>AI offers a promising solution by automating the priority assignment process. 
AI models can deliver consistent, real-time estimations, improving the accuracy and objectivity of the waitlist system. By streamlining this process, AI can reduce the burden on radiologists and enhance both efficiency and standardization.</p></sec><sec id="s4-6"><title>Clinical Implementation&#x2014;Challenges</title><p>However, implementing AI alone does not address all challenges. Several critical factors must be considered to ensure the successful clinical adoption of AI-assisted waitlist systems.</p><p>For AI-driven prioritization to be effectively integrated into clinical workflows, health care providers must be well-informed about its benefits and limitations to foster trust in the technology. While existing research shows a generally positive attitude toward AI in medicine [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>], perceptions vary depending on the underlying technology, medical specialty, and cultural background [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. For instance, the term &#x201C;AI&#x201D; encompasses a broad spectrum of technologies, from simple symptom checkers [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>] to sophisticated LLMs [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Case studies that highlight the potential and challenges of medical AI applications will facilitate dialogue among stakeholders and accelerate the acceptance of AI in clinical practice [<xref ref-type="bibr" rid="ref40">40</xref>].</p><p>The potential for clinically significant misclassifications remains a concern. If an urgent case is mistakenly assigned a lower priority, it could result in adverse patient outcomes. 
Even in situations where human evaluators might also struggle with classification, unclear responsibility could raise legal and ethical concerns regarding liability in medical decision-making.</p><p>AI models are trained on historical data, which may contain biases related to patient demographics, socioeconomic status, or institutional practices. If these biases are not addressed, they could lead to disparities in priority assignment. However, AI also offers the potential to mitigate human biases by providing consistent, data-driven prioritization. Identifying and minimizing biases through rigorous model evaluation is crucial to ensuring fairness and equity.</p><p>Integrating AI into existing hospital information systems, such as electronic medical records and order management platforms, requires substantial technical modifications. Furthermore, the costs associated with implementing, maintaining, and updating AI models may pose financial constraints for health care institutions. Assessing the cost-effectiveness and feasibility of AI adoption is critical to ensuring widespread integration.</p><p>In summary, incorporating AI into priority-based waitlist systems can enhance clinical efficiency, reduce physician workload, and improve patient care. However, addressing concerns related to user acceptance, legal and ethical responsibility, potential biases, and system integration is essential for successful implementation. Future research should focus on strategies to overcome these challenges while maximizing AI&#x2019;s clinical use in resource allocation.</p></sec><sec id="s4-7"><title>Limitations</title><p>The primary limitation of this study is its focus on a single institution, which limits the external validity of the findings. Applying our model to other institutions or medical contexts would likely require retraining, as hospitals vary in specialty composition, resource allocation, and priority assessment criteria, all of which could influence model predictions. 
In addition, the dataset is restricted to Japanese text. Future research should aim to incorporate datasets from multiple institutions and languages. While this may present challenges due to variations in clinical practices and priority criteria, addressing these issues is crucial for evaluating the model&#x2019;s robustness and generalizability. Pretraining on a sufficiently large and diverse dataset could facilitate adaptation to new institutions with minimal effort.</p></sec><sec id="s4-8"><title>Conclusions</title><p>This study demonstrates that language models can estimate examination request priorities with accuracy comparable to human radiologists and better than conventional NLP methods. Nevertheless, improvements in the reproducibility of priority rankings are required. The research also highlights the potential for adapting general-purpose models to domain-specific text through adequate fine-tuning, underscoring the flexibility and applicability of these models in specialized contexts. 
Further research should explore methods to address the ambiguity in priority assignment and validate the model&#x2019;s performance across multiple institutions.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>bidirectional encoder representations from Transformers</p></def></def-item><def-item><term id="abb3">ED</term><def><p>emergency department</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb6">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb7">QA</term><def><p>question answering</p></def></def-item><def-item><term id="abb8">ROC-AUC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb9">SHAP</term><def><p>Shapley additive explanations</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><conf-name>Proceedings of the 31st International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 4-9, 2017</conf-date><conf-loc>Long Beach, CA</conf-loc><fpage>6000</fpage><lpage>6010</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://dl.acm.org/doi/10.5555/3295222.3295349">https://dl.acm.org/doi/10.5555/3295222.3295349</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model for biomedical text mining</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Oct 18, 2019</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1901.08746">http://arxiv.org/abs/1901.08746</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Altosaar</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ranganath</surname><given-names>R</given-names> </name></person-group><article-title>ClinicalBERT: modeling clinical notes and predicting hospital readmission</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Nov 29, 2020</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1904.05342">http://arxiv.org/abs/1904.05342</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tinn</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Sep 16, 2021</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2007.15779">http://arxiv.org/abs/2007.15779</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>Transfer learning in biomedical natural language processing: an evaluation of BERT and elmo on ten benchmarking datasets</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 18, 2019</comment><pub-id pub-id-type="doi">10.18653/v1/W19-5006</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sugimoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Iki</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chida</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kanazawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Aizawa</surname><given-names>A</given-names> </name></person-group><article-title>JMedRoBERTa: a japanese pre-trained language model on academic articles in medical sciences (in japanese)</article-title><access-date>2025-07-14</access-date><conf-name>Proceedings of the 29th Annual Meeting of the Association for Natural 
Language Processing 2023</conf-name><conf-date>Mar 13-17, 2023</conf-date><conf-loc>Okinawa, Japan</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://www.anlp.jp/proceedings/annual_meeting/2023/pdf_dir/P3-1.pdf">https://www.anlp.jp/proceedings/annual_meeting/2023/pdf_dir/P3-1.pdf</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><access-date>2025-07-14</access-date><conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><conf-loc>Virtual event</conf-loc><fpage>1877</fpage><lpage>1901</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/abs/10.5555/3495724.3495883">https://dl.acm.org/doi/abs/10.5555/3495724.3495883</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ouyang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Training language models to follow instructions with human feedback</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Mar 4, 2022</comment><comment><ext-link ext-link-type="uri" 
xlink:href="http://arxiv.org/abs/2203.02155">http://arxiv.org/abs/2203.02155</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>A survey of large language models in medicine: progress, application, and challenges</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Jul 22, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2311.05112">http://arxiv.org/abs/2311.05112</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> 
</name><etal/></person-group><article-title>Towards expert-level medical question answering with large language models</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  May 16, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2305.09617">http://arxiv.org/abs/2305.09617</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lavril</surname><given-names>T</given-names> </name><name name-style="western"><surname>Izacard</surname><given-names>G</given-names> </name><etal/></person-group><article-title>LLaMA: open and efficient foundation language models</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Feb 27, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2302.13971">http://arxiv.org/abs/2302.13971</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Jul 19, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2307.09288">http://arxiv.org/abs/2307.09288</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><person-group 
person-group-type="author"><collab>CyberAgent, Inc</collab></person-group><source>cyberagent/open-calm-7b</source><year>2023</year><access-date>2023-08-25</access-date><publisher-name>Hugging Face</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/cyberagent/open-calm-7b">https://huggingface.co/cyberagent/open-calm-7b</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Riboli-Sasco</surname><given-names>E</given-names> </name><name name-style="western"><surname>El-Osta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alaa</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Triage and diagnostic accuracy of online symptom checkers: systematic review</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>2</day><volume>25</volume><fpage>e43803</fpage><pub-id pub-id-type="doi">10.2196/43803</pub-id><pub-id pub-id-type="medline">37266983</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>CYK</given-names> </name><name name-style="western"><surname>Zack</surname><given-names>T</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>BY</given-names> </name><etal/></person-group><article-title>Use of a large language model to assess clinical acuity of adults in the emergency department</article-title><source>JAMA Netw Open</source><year>2024</year><month>05</month><day>1</day><volume>7</volume><issue>5</issue><fpage>e248895</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.8895</pub-id><pub-id pub-id-type="medline">38713466</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stewart</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Goudie</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Applications of natural language processing at emergency department triage: a narrative review</article-title><source>PLoS ONE</source><year>2023</year><volume>18</volume><issue>12</issue><fpage>e0279953</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0279953</pub-id><pub-id pub-id-type="medline">38096321</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spasic</surname><given-names>I</given-names> </name><name name-style="western"><surname>Button</surname><given-names>K</given-names> </name></person-group><article-title>Patient triage by topic modeling of referral letters: feasibility study</article-title><source>JMIR Med Inform</source><year>2020</year><month>11</month><day>6</day><volume>8</volume><issue>11</issue><fpage>e21252</fpage><pub-id pub-id-type="doi">10.2196/21252</pub-id><pub-id pub-id-type="medline">33155985</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>KC</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>LC</given-names> </name></person-group><article-title>A novel deep learning-based system for triage in the emergency 
department using electronic medical records: retrospective cohort study</article-title><source>J Med Internet Res</source><year>2021</year><month>12</month><day>27</day><volume>23</volume><issue>12</issue><fpage>e27008</fpage><pub-id pub-id-type="doi">10.2196/27008</pub-id><pub-id pub-id-type="medline">34958305</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fromm</surname><given-names>M</given-names> </name><name name-style="western"><surname>Thellmann</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Tokenizer choice for LLM training: negligible or crucial?</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Mar 17, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2310.08754">http://arxiv.org/abs/2310.08754</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yamada</surname><given-names>I</given-names> </name><name name-style="western"><surname>Asai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shindo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Takeda</surname><given-names>H</given-names> </name><name name-style="western"><surname>Matsumoto</surname><given-names>Y</given-names> </name></person-group><article-title>LUKE: deep contextualized entity representations with entity-aware self-attention</article-title><conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>Nov 16-20, 
2020</conf-date><conf-loc>Online</conf-loc><fpage>6442</fpage><lpage>6454</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/2020.emnlp-main">https://www.aclweb.org/anthology/2020.emnlp-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.523</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wolf</surname><given-names>T</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sanh</surname><given-names>V</given-names> </name><etal/></person-group><article-title>HuggingFace&#x2019;s transformers: state-of-the-art natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 14, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.03771</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kudo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yamamoto</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Matsumoto</surname><given-names>Y</given-names> </name></person-group><article-title>Applying conditional random fields to Japanese morphological analysis</article-title><conf-name>Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>2004</conf-date><conf-loc>Barcelona, Spain</conf-loc><fpage>230</fpage><lpage>237</lpage></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sato</surname><given-names>T</given-names> </name><name name-style="western"><surname>Okumura</surname><given-names>M</given-names> </name></person-group><article-title>Operation of a word segmentation dictionary generation system called NEologd (in Japanese)</article-title><conf-name>Information Processing Society of Japan, Special Interest Group on Natural Language Processing (IPSJ-SIGNL) Information Processing Society of Japan</conf-name><conf-date>Dec 20-22, 2016</conf-date><conf-loc>Tokyo</conf-loc></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Oct 16, 2021</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2106.09685">http://arxiv.org/abs/2106.09685</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name 
name-style="western"><surname>Mangrulkar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gugger</surname><given-names>S</given-names> </name><name name-style="western"><surname>Debut</surname><given-names>L</given-names> </name><name name-style="western"><surname>Belkada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Paul</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bossan</surname><given-names>B</given-names> </name></person-group><article-title>PEFT: state-of-the-art parameter-efficient fine-tuning methods</article-title><source>GitHub</source><year>2022</year><access-date>2025-07-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/huggingface/peft">https://github.com/huggingface/peft</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lundberg</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SI</given-names> </name></person-group><article-title>A unified approach to interpreting model predictions</article-title><access-date>2025-07-14</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 4-9, 2017</conf-date><conf-loc>Long Beach, CA</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2017/file/8a20a8621978632d76c43dfd28b67767-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Christophe</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Kanithi</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Munjal</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Med42 -- evaluating fine-tuning strategies for medical LLMs: full-parameter vs. parameter-efficient approaches</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Apr 23, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2404.14779">http://arxiv.org/abs/2404.14779</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sukeda</surname><given-names>I</given-names> </name><name name-style="western"><surname>Suzuki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sakaji</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kodera</surname><given-names>S</given-names> </name></person-group><article-title>JMedLoRA: medical domain adaptation on Japanese large language models using instruction-tuning</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  Dec 1, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2310.10083">http://arxiv.org/abs/2310.10083</ext-link></comment><pub-id pub-id-type="doi">10.36922/aih.2695</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dettmers</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pagnoni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Holtzman</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Zettlemoyer</surname><given-names>L</given-names> </name></person-group><article-title>QLoRA: efficient finetuning of quantized LLMs</article-title><source>arXiv</source><access-date>2025-07-14</access-date><comment>Preprint posted online on  May 23, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2305.14314">http://arxiv.org/abs/2305.14314</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cao</surname><given-names>B</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>W</given-names> </name></person-group><article-title>AI triage or manual triage? Exploring medical staffs&#x2019; preference for AI triage in China</article-title><source>Patient Educ Couns</source><year>2024</year><month>02</month><volume>119</volume><fpage>108076</fpage><pub-id pub-id-type="doi">10.1016/j.pec.2023.108076</pub-id><pub-id pub-id-type="medline">38029576</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stewart</surname><given-names>J</given-names> </name><name name-style="western"><surname>Freeman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Eroglu</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Attitudes towards artificial intelligence in emergency medicine</article-title><source>Emerg Med Australas</source><year>2024</year><month>04</month><volume>36</volume><issue>2</issue><fpage>252</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1111/1742-6723.14345</pub-id><pub-id pub-id-type="medline">38044755</pub-id></nlm-citation></ref><ref 
id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Katirai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yamamoto</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Kogetsu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kato</surname><given-names>K</given-names> </name></person-group><article-title>Perspectives on artificial intelligence in healthcare from a Patient and Public Involvement Panel in Japan: an exploratory study</article-title><source>Front Digit Health</source><year>2023</year><volume>5</volume><fpage>1229308</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2023.1229308</pub-id><pub-id pub-id-type="medline">37781456</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>WW</given-names> </name><name name-style="western"><surname>Chia</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Cheung</surname><given-names>MF</given-names> </name><etal/></person-group><article-title>Risk perception, acceptance, and trust of using AI in gastroenterology practice in the Asia-Pacific region: web-based survey study</article-title><source>JMIR AI</source><year>2024</year><month>03</month><day>7</day><volume>3</volume><issue>1</issue><fpage>e50525</fpage><pub-id pub-id-type="doi">10.2196/50525</pub-id><pub-id pub-id-type="medline">38875591</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Virani</surname><given-names>V</given-names> </name><name name-style="western"><surname>Wicks</surname><given-names>P</given-names> </name></person-group><article-title>Patients&#x2019; utilization and perception of an artificial intelligence-based symptom assessment and advice technology in a British primary care waiting room: exploratory pilot study</article-title><source>JMIR Hum Factors</source><year>2020</year><month>07</month><day>10</day><volume>7</volume><issue>3</issue><fpage>e19713</fpage><pub-id pub-id-type="doi">10.2196/19713</pub-id><pub-id pub-id-type="medline">32540836</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Meczner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Burslam-Dawe</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hayhoe</surname><given-names>B</given-names> </name></person-group><article-title>Triage errors in primary and pre-primary care</article-title><source>J Med Internet Res</source><year>2022</year><month>06</month><day>24</day><volume>24</volume><issue>6</issue><fpage>e37209</fpage><pub-id pub-id-type="doi">10.2196/37209</pub-id><pub-id pub-id-type="medline">35749166</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>D</given-names> </name><name name-style="western"><surname>Timsina</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gorenstein</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Traditional machine learning, deep learning, and BERT (Large Language Model) approaches for predicting 
hospitalizations from nurse triage notes: comparative evaluation of resource management</article-title><source>JMIR AI</source><year>2024</year><month>08</month><day>27</day><volume>3</volume><issue>1</issue><fpage>e52190</fpage><pub-id pub-id-type="doi">10.2196/52190</pub-id><pub-id pub-id-type="medline">39190905</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Masanneck</surname><given-names>L</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>L</given-names> </name><name name-style="western"><surname>Seifert</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Triage performance across large language models, ChatGPT, and untrained doctors in emergency medicine: comparative study</article-title><source>J Med Internet Res</source><year>2024</year><month>06</month><day>14</day><volume>26</volume><issue>1</issue><fpage>e53297</fpage><pub-id pub-id-type="doi">10.2196/53297</pub-id><pub-id pub-id-type="medline">38875696</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Starke</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gille</surname><given-names>F</given-names> </name><name name-style="western"><surname>Termine</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Finding consensus on trust in AI in health care: recommendations from a panel of international experts</article-title><source>J Med Internet Res</source><year>2025</year><month>02</month><day>19</day><volume>27</volume><issue>1</issue><fpage>e56306</fpage><pub-id pub-id-type="doi">10.2196/56306</pub-id><pub-id pub-id-type="medline">39969962</pub-id></nlm-citation></ref></ref-list></back></article>