<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e73274</article-id><article-id pub-id-type="doi">10.2196/73274</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Language Model for Pediatric Occupational Therapy Documentation: Model Development and Pilot Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>DiMaio</surname><given-names>Rachel</given-names></name><degrees>MASc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tuinstra</surname><given-names>Tia</given-names></name><degrees>MASc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yu</surname><given-names>Trevor</given-names></name><degrees>MASc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Koshy</surname><given-names>Ilona</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wylie-Toal</surname><given-names>Brendan</given-names></name><degrees>MASc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tripp</surname><given-names>Bryan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Systems Design Engineering, Faculty of Engineering, University of Waterloo</institution><addr-line>200 University Ave W</addr-line><addr-line>Waterloo</addr-line><country>Canada</country></aff><aff id="aff2"><institution>KidsAbility Centre for Child Development</institution><addr-line>Waterloo</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff3"><institution>School of Environment, Enterprise and Development, Faculty of the Environment, University of Waterloo</institution><addr-line>Waterloo</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Malin</surname><given-names>Bradley</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Ajayi</surname><given-names>David</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kimura</surname><given-names>Eizen</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Keerthana</surname><given-names>Garapati</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lin</surname><given-names>Kuan-Hsun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Dehkordi</surname><given-names>Mahshad Koohi Habibi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mansoor</surname><given-names>Masab</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Rachel DiMaio, MASc, Department of Systems Design Engineering, Faculty of Engineering, University of Waterloo, 200 University Ave W, Waterloo, N2L 3G1, Canada, 1 (519) 888-4567; <email>rmdimaio@uwaterloo.ca</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>15</day><month>5</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e73274</elocation-id><history><date date-type="received"><day>03</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>07</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>28</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Rachel DiMaio, Tia Tuinstra, Trevor Yu, Ilona Koshy, Brendan Wylie-Toal, Bryan Tripp. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 15.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e73274"/><abstract><sec><title>Background</title><p>In occupational therapy, progress notes and other client-related administrative tasks are essential for providing treatment but are time-consuming. Therapists spend at least as much time on these tasks as providing care, which contributes to growing waitlists.</p></sec><sec><title>Objective</title><p>This study aimed to create a custom large language model to make the process of writing progress notes more efficient by converting point-form scratch notes from pediatric occupational therapy treatment sessions into draft documentation in subjective-objective-assessment-plan format.</p></sec><sec sec-type="methods"><title>Methods</title><p>Using a dataset of redacted historical progress notes, various training methods, including domain-adaptive pretraining and low-rank adaptation fine-tuning, were applied to train Llama 2 and 3 models. Since the historical notes lacked corresponding scratch notes, few-shot prompting with human-in-the-loop evaluations was used to generate synthetic scratch notes. 
This pairing of historical notes and generated scratch notes enabled effective fine-tuning of the Llama models on the desired task. The final model, a fine-tuned Llama 3 8B Instruct model, was piloted in a pediatric rehabilitation center and compared with Microsoft Copilot. Ten therapists used both models for 3 weeks each.</p></sec><sec sec-type="results"><title>Results</title><p>The custom model notes scored higher than manually written notes on clarity, completeness, relevance, and organization (<italic>P</italic>&#x003C;.001), and similarly on conciseness. They scored higher than those from Copilot on conciseness (<italic>P</italic>&#x003C;.001). However, this small pilot did not detect a significant reduction in time spent on documentation when using the custom model versus writing notes manually. Follow-up investigation revealed that time savings occurred only when therapists were coached to write sparse scratch notes; however, after coaching, therapists tended to revert to detailed notes, for which the model was not shown to improve efficiency.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The model had the capacity to save time when therapists provided brief input. However, in practice, therapists preferred to provide detailed input. Used in this way, the model improved note quality rather than saving time.</p></sec></abstract><kwd-group><kwd>SOAP notes</kwd><kwd>large language models</kwd><kwd>occupational therapy</kwd><kwd>documentation</kwd><kwd>model development</kwd><kwd>electronic medical records</kwd><kwd>natural language processing</kwd><kwd>pilot study</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Clinical documentation is essential for continuity of care. In recent decades, documentation has also become important for billing and legal protection. These requirements have added complexity and volume, so that documentation now competes with patient care and contributes to clinician burnout [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>], which can lead to medical errors [<xref ref-type="bibr" rid="ref6">6</xref>] and early retirement [<xref ref-type="bibr" rid="ref7">7</xref>]. While physicians spend about half their time on documentation and administrative tasks [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], this burden is particularly acute in pediatric rehabilitation. Internal data from KidsAbility, the primary affiliation of some of the authors, suggest that therapists spend about 50% more time on client-related administrative work than interacting with clients. This creates barriers to access. For example, in our region, waitlists are growing longer as demand increases while therapist capacity is flat [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. The process of completing documentation and other administrative tasks can also be repetitive and tedious for occupational therapists, reducing job satisfaction [<xref ref-type="bibr" rid="ref12">12</xref>]. 
The primary goal of this project was to develop a system that uses machine learning to reduce the amount of time that clinicians spend on documentation, with a focus on the needs of pediatric rehabilitation clinicians and an emphasis on data privacy.</p><p>Machine learning systems are becoming widely adopted to reduce the burden of clinical documentation, performing tasks such as selecting billing codes and generating clinical reports and notes [<xref ref-type="bibr" rid="ref13">13</xref>]. Many clinicians use automated scribe tools to create draft documentation from transcribed conversations with patients. These systems generate new text, summarize relevant findings, and/or retrieve information from existing sources. Some systems are also multimodal, using vision models to incorporate information from medical images [<xref ref-type="bibr" rid="ref14">14</xref>]. Clinicians typically make amendments to these automated notes [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Biomedical large language models (LLMs) have the potential to reduce time spent by clinicians on documentation based on their success in medical language tasks [<xref ref-type="bibr" rid="ref13">13</xref>]. Frontier models such as ChatGPT have been reported to perform well in these applications [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], and others have reported improvements with fine-tuned custom models [<xref ref-type="bibr" rid="ref18">18</xref>]. The use of commercial models can raise questions about the privacy of health information because it requires sending patient data to artificial intelligence (AI) companies. Additionally, large generalist models have infrastructure requirements that can hinder their use in custom applications [<xref ref-type="bibr" rid="ref19">19</xref>], and their routine use poses a threat to the environment [<xref ref-type="bibr" rid="ref20">20</xref>]. Increases in model size are associated with higher computational, energy, and hardware demands during training and deployment, resulting in increased environmental impact. However, there are a number of smaller open-source LLMs with increasingly competitive performance in a variety of language tasks, which could potentially be fine-tuned to specialize in narrow clinical documentation tasks, performing them more efficiently and under the control of the health care provider [<xref ref-type="bibr" rid="ref21">21</xref>].</p></sec><sec id="s1-2"><title>Goal of This Study</title><p>This study explores the customization of a small LLM to write progress notes for occupational therapy appointments, a key documentation task in pediatric rehabilitation. Many therapists write progress notes based on point-form scratch notes taken during the appointment. The system was designed to convert these scratch notes into draft progress notes in the widely used subjective-objective-assessment-plan (SOAP) format [<xref ref-type="bibr" rid="ref22">22</xref>]. The therapists would be required to review and edit each draft and would retain legal responsibility for their content. The research team developed and tested a system for this purpose in consultation with therapists at KidsAbility, a pediatric rehabilitation center in Southwestern Ontario. KidsAbility provides services for children and youth with communication, physical, or developmental needs, including occupational therapy (OT). 
A previously published abstract summarizes the key points of this research [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s1-3"><title>Prior Work</title><p>Most of the previous work in clinical note generation has used visit transcripts as input [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref27">27</xref>]. This task is closely related to abstractive summarization and particularly to abstractive summarization of meetings [<xref ref-type="bibr" rid="ref28">28</xref>]. In an early work, Enarvi et al [<xref ref-type="bibr" rid="ref29">29</xref>] reported that transformers outperformed recurrent networks in this task. Krishna et al [<xref ref-type="bibr" rid="ref25">25</xref>] compared several methods of generating SOAP notes from visit transcripts and found that it was helpful to separate the pipeline into multiple stages that first recognized and extracted salient utterances, then clustered these into groups of related utterances, and then generated a SOAP note sentence related to each cluster. Ramprasad et al [<xref ref-type="bibr" rid="ref27">27</xref>], using transformer models, found that training different cross-attention parameters for each SOAP note section improved performance. The MEDIQA-Chat 2023 competition included the task of generating SOAP notes from patient-doctor conversations [<xref ref-type="bibr" rid="ref30">30</xref>]. The winning entry [<xref ref-type="bibr" rid="ref16">16</xref>] used few-shot inference with GPT-4, importantly selecting few-shot examples based on similarity with the input conversation. Biswas and Talukdar [<xref ref-type="bibr" rid="ref17">17</xref>] also reported strong results using GPT-4 compared with several other models. Chen and Hirschberg [<xref ref-type="bibr" rid="ref31">31</xref>] compared the summarization abilities of the 2 methods that performed best in MEDIQA-Chat (fine-tuning LLMs and GPTs) and showed that explicit prompting for SOAP style notes led to outputs with relevant information in all SOAP categories. More generally, LLMs such as ChatGPT have been shown to be effective in medical dialogue summarization, focusing on relevant medical facts [<xref ref-type="bibr" rid="ref32">32</xref>].</p><p>At the time of writing, there are a number of commercial products that transcribe patient visits and generate various notes and reports, such as DeepScribe [<xref ref-type="bibr" rid="ref33">33</xref>], Tali [<xref ref-type="bibr" rid="ref34">34</xref>], Nuance Dragon Medical One [<xref ref-type="bibr" rid="ref35">35</xref>], and AutoScribe [<xref ref-type="bibr" rid="ref36">36</xref>]. Despite positive clinical reception, a recent study of one such tool found that editing was needed to maintain document quality, and that once this step was accounted for, the tool reduced documentation time by less than 10% [<xref ref-type="bibr" rid="ref37">37</xref>]. Consistent with this, Knoll et al [<xref ref-type="bibr" rid="ref15">15</xref>] reported that physicians felt that the value of such tools lay in allowing them to focus more on patients rather than in saving time.</p><p>Certain specifics of pediatric rehabilitation motivate the alternative approach of using scratch notes instead of visit transcripts as input. Pediatric rehabilitation transcripts are fragmented due to interaction between therapist, parent, and child. 
The visits tend to be longer than routine primary care visits (typically at least 45 minutes), and much of the conversation between therapists and children is only loosely related to the SOAP note. For example, some of the therapists&#x2019; speech is meant to keep young children entertained. Furthermore, pediatric rehabilitation clients are typically less able to verbalize concerns due to factors such as disability and young age; some clients are preverbal. Overall, it is difficult to capture the treatment in dialogue. In contrast, physician SOAP notes often contain closely related (or even verbatim) [<xref ref-type="bibr" rid="ref27">27</xref>] text from the transcript, such as the patient&#x2019;s description of symptoms. Unspoken observations have been identified as a challenge [<xref ref-type="bibr" rid="ref15">15</xref>], which may be even more prominent in pediatric rehabilitation. LLMs have a tendency to confabulate unknown details [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>], and the fact that pediatric rehabilitation transcripts do not explicitly contain much of the necessary information may exacerbate this. Consistent with these challenges, therapists at KidsAbility reported mixed success in preliminary testing of a commercial transcription-based system. Finally, and most importantly, a large fraction of therapists work in public settings such as classrooms with many other students where recording and transcription are not possible. Scratch notes provide an input that is more focused, with less irrelevant information that might confuse the model, and that includes unspoken observations.</p><p>While this work was under review, several related works were published. For instance, Du et al [<xref ref-type="bibr" rid="ref40">40</xref>] published a systematic review of the use of LLMs in electronic medical record (EMR) applications, finding 196 papers that used LLMs to analyze real-world EMR data. Notably, Du et al found that the majority (122/196, 62.2%) involved clinical decision support, with only 5.6% (11/196) of the studies related to text summarization tasks. Furthermore, the authors noted that many disciplines were not represented and that there is not yet a clinically meaningful evaluation framework that can be used for LLM applications. A more recent study focusing on the summarization of EMR documentation, by Dehkordi et al [<xref ref-type="bibr" rid="ref41">41</xref>], combined highlighting of detailed information in discharge notes with prompt engineering to improve the quality of LLM-written summaries. Van Veen et al [<xref ref-type="bibr" rid="ref42">42</xref>] showed that LLMs could outperform medical experts at the summarization of EMR data including progress notes, positing that LLMs could potentially be used in this capacity to help alleviate clinician documentation burden. However, a systematic review conducted by Bednarczyk et al [<xref ref-type="bibr" rid="ref43">43</xref>] stresses that reliable performance assessments and clinical impact evaluations are largely absent from the literature focusing on EMR summarization tasks. 
In the domain of clinical note generation using LLMs, there are several examples of frameworks developed to alleviate clinician documentation burden, including a framework for zero-shot and few-shot generation of emergency medical services documentation based on transcripts by Bai et al [<xref ref-type="bibr" rid="ref44">44</xref>]; a pipeline to generate SOAP notes enhanced with keywords from doctor-patient conversations by Li et al [<xref ref-type="bibr" rid="ref45">45</xref>]; SpecialtyScribe by Goyal et al [<xref ref-type="bibr" rid="ref46">46</xref>], a set of modules for generating SOAP notes specific to a particular discipline based on transcripts; and a multimodal framework for generating SOAP notes based on skin lesion images and sparse clinical text by Kamal et al [<xref ref-type="bibr" rid="ref47">47</xref>].</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>The following subsections describe the data curation and model development processes. The model pipeline is summarized in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Our best-performing pipeline for custom language model development via fine-tuning Llama 3 8B Instruct. OT: occupational therapy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e73274_fig01.png"/></fig><sec id="s2-2"><title>Ethical Considerations</title><p>The use of historical EMR data for research purposes was approved by the University of Waterloo Research Ethics Board (institutional review board no. 45491). Specifically, the secondary use of these data was approved for research purposes without the requirement of additional informed consent. While personal health information (PHI) was not needed for the study and was explicitly excluded where possible by KidsAbility-authorized professionals, the possibility of incidental PHI in the content of the notes used for training necessitated the approval of the ethics board. The families at KidsAbility provided general consent for the use of their data for research purposes that are Personal Health Information Protection Act&#x2013;compliant. When they signed the KidsAbility privacy policy, parents were given the opportunity to exclude their child&#x2019;s data from research projects. In the rare cases where this was requested, those data were excluded from this project. The data were made available in an environment on KidsAbility-authorized servers that was compliant with the Personal Health Information Protection Act. All linked PHI was removed before the research team accessed the data, including all metadata concerning the client, such as their name, age, or location. Since LLMs can memorize portions of training data and potentially output them [<xref ref-type="bibr" rid="ref48">48</xref>], any incidental PHI that appeared in the body of the text, such as first names, was also redacted. The redaction process was completed using a combination of the Amazon Comprehend service [<xref ref-type="bibr" rid="ref49">49</xref>] and custom redaction code. First, the Comprehend service was set to automatically remove any names or ages from the text, replacing any found with the strings [NAME] and [AGE], respectively. 
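</p><p>A minimal sketch of this 2-stage redaction is shown below; the Comprehend call is the real boto3 API, but the region, the offset-based replacement, and the name-list handling (described next) are illustrative assumptions rather than the production code.</p><preformat>
# Illustrative sketch of the 2-stage redaction step; the Comprehend call is
# real, but the region and the name-list handling are assumptions.
import boto3

comprehend = boto3.client("comprehend", region_name="ca-central-1")  # region assumed

def redact(text, first_names):
    """Replace detected names/ages with [NAME]/[AGE], then sweep a name list."""
    resp = comprehend.detect_pii_entities(Text=text, LanguageCode="en")
    # Replace from the end of the string so earlier offsets remain valid.
    for ent in sorted(resp["Entities"], key=lambda e: e["BeginOffset"], reverse=True):
        if ent["Type"] in ("NAME", "AGE"):
            tag = "[NAME]" if ent["Type"] == "NAME" else "[AGE]"
            text = text[: ent["BeginOffset"]] + tag + text[ent["EndOffset"] :]
    # Second pass: catch remaining first names from an unlinked list
    # (assumed here to be a set of lowercase strings).
    words = [
        "[NAME]" if w.strip(".,;:!?").lower() in first_names else w
        for w in text.split(" ")
    ]
    return " ".join(words)
</preformat><p>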
The custom redaction code used an unlinked list of first names from KidsAbility to find and replace any remaining names.</p></sec><sec id="s2-3"><title>Datasets</title><sec id="s2-3-1"><title>Overview</title><p>To minimize editing after generation, the model was designed to produce draft SOAP notes in the same style as clinicians at KidsAbility. Ideal training data for this purpose would include historical SOAP notes linked to corresponding point-form scratch notes. However, because clinicians do not typically save their scratch notes, such paired data were not available.</p><p>KidsAbility had access to tens of thousands of historical SOAP notes stored in their previous EMR, GoldCare, which were migrated to their new system, AlayaCare, in 2023. The data were deidentified and made accessible to the research team via Amazon Web Services. Additionally, historical progress notes from AlayaCare were later provided to the research team within the same secure infrastructure.</p></sec><sec id="s2-3-2"><title>Dataset Formatting</title><p>During data review, the research team found systematic differences between notes authored before and after the migration. Notes from AlayaCare, collected between January 2023 and March 2024, followed a more consistent SOAP structure due to the use of standardized templates in the new EMR. In contrast, the older notes often lacked clear section headings and were highly variable in style. For clarity, these datasets are referred to as the GoldCare dataset (the larger body of historical data from before the EMR migration) and the AlayaCare dataset (the smaller but better-formatted set of notes from after the migration).</p></sec><sec id="s2-3-3"><title>GoldCare Dataset</title><p>The GoldCare dataset consisted of 411,812 progress notes taken from 2012 to 2023. The data were filtered to include only OT progress notes, excluding speech-language therapy and physiotherapy notes. After this filtering, the dataset contained 37,437 OT notes.</p><p>The GoldCare dataset was also filtered by token length, using the Llama 2 7B tokenizer, to include only notes with 50-1000 tokens, accounting for the majority of the notes within the distribution. Inspection of examples showed that notes with fewer than 50 tokens were less likely to be proper SOAP notes, and a 1000-token maximum enabled faster, more cost-effective pretraining with a sequence length of 1024.</p><p>Furthermore, the dataset was divided into a domain-adaptive pretraining (DAPT) dataset (90%) and a fine-tuning dataset (10%). The split was performed by client ID so that each client&#x2019;s notes appeared in only one of the two datasets. Finally, the fine-tuning dataset was filtered once more to exclude any notes under 150 tokens, since further inspection revealed that notes in the range of 50-150 tokens were less likely to conform to SOAP format. The pretraining dataset was not filtered further. This left 2298 notes in the fine-tuning dataset and 25,164 notes in the pretraining dataset.</p></sec><sec id="s2-3-4"><title>AlayaCare Fine-Tuning Dataset</title><p>The AlayaCare dataset originally consisted of 1127 SOAP notes from OT appointments, which were used only for fine-tuning as the dataset was not large enough to support DAPT. This dataset was filtered to include notes with 100-1000 tokens, using the Llama 3 8B tokenizer, which is similar to the Llama 2 7B tokenizer. 
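</p><p>As an illustrative sketch of the token-length filtering and client-level splitting just described (file and column names are hypothetical; the token thresholds and the 90/10 client-level split follow the text):</p><preformat>
# Sketch of the GoldCare token-length filter and client-level split;
# file and column names are placeholders for illustration.
import pandas as pd
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

notes = pd.read_csv("goldcare_ot_notes.csv")  # hypothetical deidentified export
notes["n_tokens"] = notes["text"].map(lambda t: len(tok(t)["input_ids"]))
notes = notes[notes["n_tokens"].between(50, 1000)]

# Split by client ID so that each client appears in only one subset.
clients = notes["client_id"].drop_duplicates().sample(frac=1.0, random_state=0)
n_dapt = int(0.9 * len(clients))
dapt_set = notes[notes["client_id"].isin(clients.iloc[:n_dapt])]
finetune_set = notes[notes["client_id"].isin(clients.iloc[n_dapt:])]
finetune_set = finetune_set[finetune_set["n_tokens"] >= 150]  # stricter minimum
</preformat><p>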
A smaller minimum length was allowed than in the GoldCare fine-tuning dataset because these notes were more reliably structured despite shorter token lengths. After filtering, the AlayaCare fine-tuning dataset comprised 973 notes in total.</p></sec></sec><sec id="s2-4"><title>Scratch Note Generation</title><p>Achieving good performance with moderately sized LLMs often requires fine-tuning them on the task of interest [<xref ref-type="bibr" rid="ref16">16</xref>]. For this purpose, examples of paired scratch notes (inputs) and SOAP notes (outputs) were necessary. However, the historical data did not contain scratch notes, so synthetic scratch notes corresponding to each SOAP note in the fine-tuning datasets were generated using few-shot inference with a Llama 2 70B model. A small set of high-quality examples illustrating the task (creating a scratch note based on a progress note) was provided in the prompt. The examples consisted of 45 scratch note and SOAP note pairs provided by clinicians at KidsAbility. Larger models such as Llama 2 70B excel in few-shot inference, mimicking the style and structure of provided examples [<xref ref-type="bibr" rid="ref50">50</xref>], and were used here to capture the style and structure of KidsAbility scratch notes.</p><p>Past studies using LLMs to generate or augment training data [<xref ref-type="bibr" rid="ref51">51</xref>-<xref ref-type="bibr" rid="ref53">53</xref>] have found that it is important to optimize the quality of the generated data. Evaluation of factors such as realism, relevance, and biases can help ensure that the generated data are effective for fine-tuning. Accordingly, a human-in-the-loop evaluation process was used to find an effective generation approach that produced text of consistent quality when provided with a single example scratch note and SOAP note pair. A generation approach was iteratively refined, varying prompt formulations, sampling generation temperatures, and postprocessing steps. The effect of temperature was explored by varying the generation temperature within a predefined range (0.5&#x2010;0.9) while holding the other parameters constant, after which a temperature (0.8) was chosen based on qualitative assessment by the research team. The effect of different prompts and postprocessing steps was evaluated by clinician collaborators. Three OT clinicians scored scratch notes that were generated with different parameters and different few-shot examples in a 3-step evaluation process. The first step included 100 scratch notes, while the second and third included 80 each. In total, 7 different prompt formulations, 5 randomly selected few-shot example subsets, and 2 postprocessing techniques were assessed. To evaluate output stability, multiple scratch notes (n=4) were also generated with the same parameters.</p><p>The clinicians used a structured rubric to independently evaluate the scratch notes on realism, accuracy, and appropriate distinctions (between subjective reporting, goals, and observations). Each criterion was scored on a 5-point Likert scale (1 being the worst, and 5 being the best). An additional criterion, brevity, did not require clinical expertise and was therefore assessed by the research team. 
This criterion considered whether the model generated unwanted chat text such as &#x201C;Sure, here is a scratch note.&#x201D; Additionally, the clinicians were asked to flag any notes that they considered particularly inaccurate or problematic and to provide qualitative feedback.</p><p>The scores were averaged across raters for each combination of prompt and postprocessing strategy. Descriptive statistics (mean and SD) together with qualitative feedback were used to select the final generation approach. The raters were generally concordant, but a formal intraclass correlation coefficient was not calculated. Using the selected parameters, the scratch notes were generated with all curated examples. For each generation, a few-shot example was selected programmatically, cycling through the available set to promote variability while preserving stylistic consistency.</p></sec><sec id="s2-5"><title>Model Training</title><sec id="s2-5-1"><title>Overview</title><p>The models chosen for training were the Llama 2 7B Chat model and then the Llama 3 8B Instruct model after its release in 2024. The smallest Llama models were chosen to minimize training and inference compute costs. Compared with other open-source LLMs, the Llama models are well documented, perform well on benchmarks, and performed best in our initial, exploratory testing. The Chat and Instruct versions of these models are those that have undergone fine-tuning and posttraining that refine the models&#x2019; ability to respond to chat prompts and instructions, which was desired for the given task.</p><p>For both DAPT and fine-tuning on the SOAP note task, the models were trained through causal language modeling (CLM), which is a self-supervised training process that tasks the model with autoregressively predicting the next token in a sequence [<xref ref-type="bibr" rid="ref54">54</xref>]. The models were trained for a single epoch to reduce the risk of memorizing the training data and compromising privacy [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. The hyperparameters were based on those used in the Llama 2 fine-tuning process [<xref ref-type="bibr" rid="ref57">57</xref>]. Further implementation details, including the hyperparameters, can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-5-2"><title>Domain-Adaptive Pretraining</title><p>LLMs are pretrained on massive text corpora. DAPT is a form of transfer learning that involves continued pretraining using CLM on text from a specific domain. DAPT is used to train a general pretrained model to understand domain-specific vocabulary and nuances and to produce text more suitable to the target domain [<xref ref-type="bibr" rid="ref58">58</xref>]. The GoldCare pretraining dataset, composed of 25,164 historical progress notes, was used for DAPT. Training was performed for 1 epoch on the Llama 2 7B Chat model.</p></sec><sec id="s2-5-3"><title>Fine-Tuning</title><p>In contrast with DAPT, fine-tuning is used to adapt a model to a particular task, rather than just a particular kind of text. Parameter-efficient fine-tuning was explored, specifically low-rank adaptation (LoRA), a prevalent method that trains large models efficiently in terms of both computation and the size of the fine-tuning dataset [<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>]. 
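</p><p>The sketch below shows what such a setup looks like with the Hugging Face peft library; the rank, scaling, and target modules are illustrative placeholders rather than the study&#x2019;s hyperparameters, which are listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The mechanism itself is described after the sketch.</p><preformat>
# Illustrative LoRA setup with the Hugging Face peft library; the rank,
# alpha, and target modules are placeholders, not the study's values.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
lora = LoraConfig(
    r=8,                                  # rank of the low-dimensional factors
    lora_alpha=16,                        # scaling of the adapter update
    target_modules=["q_proj", "v_proj"],  # attention projections to adapt
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora)  # base weights frozen; adapters trainable
model.print_trainable_parameters()
</preformat><p>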
This method uses low-rank adapters, <italic>n</italic> &#x00D7; <italic>n</italic> weight matrices that consist of an outer product of 2 low-dimensional weight matrices (eg, <italic>n</italic> &#x00D7; 2 and 2 &#x00D7; <italic>n</italic>), forcing these weights to have low rank and few parameters. A low-rank weight matrix is summed with an existing <italic>n</italic> &#x00D7; <italic>n</italic> weight matrix. Only the low-rank adapters are optimized during LoRA fine-tuning while the original parallel weights are frozen.</p><p>The GoldCare and AlayaCare fine-tuning datasets were used for different fine-tuning training runs. Each of these datasets was further split into training (80%) and validation (20%) subsets. Supervised fine-tuning was performed for a single epoch. The performance of these variations was evaluated as described in the &#x201C;Qualitative Model Evaluation&#x201D; section. The best-performing process, which produced the fine-tuned Llama 3 8B Instruct model that was piloted with clinicians, is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p></sec></sec><sec id="s2-6"><title>Qualitative Model Evaluation</title><p>There are no standard benchmarks for evaluating LLMs in SOAP note generation or related health care tasks [<xref ref-type="bibr" rid="ref61">61</xref>]. For this reason, manual evaluations were performed to determine which model variation to pilot with therapists. These evaluations compared LoRA models vs fully fine-tuned models, models with vs without DAPT, Llama 2 vs Llama 3 models, and models trained on the GoldCare vs AlayaCare fine-tuning datasets. Additionally, the fine-tuned models were compared against Llama 2 Chat and Llama 3 Instruct models that had not been fine-tuned. These preliminary evaluations were performed by the research team.</p><p>It was sometimes obvious which model produced a worse output, for example, if one failed to follow the desired format or copied verbatim from the scratch notes. However, in other comparisons, each model had strengths and weaknesses that had to be examined more systematically. Thus, independent evaluations were performed by 3 members of the research team.</p><p>For these evaluations, 10 of the curated scratch notes were used, paired with ground-truth SOAP notes, and SOAP notes generated by 2 of the best models. The research team voted on which of the generated outputs was superior overall, considering realism, accuracy, clarity, proper formatting, the extent to which the model followed instructions (eg, &#x201C;Use point-form&#x201D;), readability, and qualitative similarity (in terms of voice and style) with the ground-truth SOAP note.</p></sec><sec id="s2-7"><title>Pilot Study</title><sec id="s2-7-1"><title>Overview</title><p>The best-performing model, as determined by the qualitative evaluation, was incorporated into a web-based user interface and piloted with a group of 10 occupational therapists at KidsAbility. The goal of the pilot was to compare timing and quality measures between the traditional manual process, the custom model, and Microsoft&#x2019;s Copilot application, which is based on GPT-4. Due to constraints such as the timing of the study and the availability of clinicians, the pilot was sequential, noncrossover, and nonrandomized. The occupational therapists first wrote all their SOAP notes using Copilot for 3 weeks. 
For each SOAP note, they submitted an online form that indicated the time and date of the note and the self-reported time spent writing it. After a 1-week break from the pilot, during which clinicians could either continue using Copilot or revert to a manual process, the clinicians used the custom model for 3 weeks. In this stage, our web application automatically saved their scratch notes, the generated draft SOAP notes, and the edited SOAP notes. As before, they submitted a form with self-reported time spent on each note. This self-reported time included time typing scratch notes, interacting with the model, and editing the generated SOAP note. All notes required clinician review and editing before being submitted.</p></sec><sec id="s2-7-2"><title>SOAP Note Quality Evaluation</title><p>SOAP note quality was evaluated by a separate research group to reduce bias [<xref ref-type="bibr" rid="ref62">62</xref>]. Briefly, 4 clinicians, each with at least 2 years of experience in providing pediatric rehabilitation treatment, independently reviewed 256 SOAP notes. The SOAP notes included 64 notes from each of 4 categories: manually written SOAP notes (Non-AI Notes), notes written by Copilot and edited by the clinicians (Copilot Edited), unedited SOAP notes from the custom model (Custom), and notes from the custom model that had been edited (Custom Edited). Each clinician was given a rubric with 5 criteria (Clear, Complete, Concise, Relevant, and Organized) on which to score the 256 SOAP notes and a spreadsheet that contained the SOAP notes in a randomized order. The clinicians were not informed of the category of each note, although this may sometimes have been apparent from the note contents, as the manual notes varied more widely in style.</p></sec><sec id="s2-7-3"><title>Objective Timing Data</title><p>Although self-reported timing data were collected during the pilot, the research team sought more precise timing data. Therapists often paused or switched to other tasks while working on SOAP notes, so that automated time stamps would not accurately reflect the time used to write the notes. To collect more accurate timing data, an experimenter observed clinicians and collected timing data in virtual meetings. Four clinicians participated and were asked to write 2-5 SOAP notes per meeting. The clinicians were asked to refrain from beginning their note, including typing any scratch notes, prior to the call. During the virtual meeting, a method for writing each note was chosen at random, with a 50% chance of using the custom model or manually writing the note. A timer was started when the occupational therapist had opened the application they intended to use and was ready to start typing. The timer was stopped once the occupational therapist reported submitting the note in the AlayaCare EMR.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Scratch Note Generation</title><p>The best variation of few-shot prompting achieved average scores of 4.55 for realism, 4.15 for accuracy, and 4.3 for appropriate distinctions between sections. The worst variation tested achieved scores of 3.5, 3.45, and 3.4 in these categories. The most important factor in performance was the system prompt. The best system prompt is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. An enumerated list of explicit requirements was effective. Also, when this prompt was used repeatedly on the same input, the outputs received consistently high scores. 
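</p><p>For illustration, a sketch of the generation call consistent with this description is shown below; SYSTEM_PROMPT and EXAMPLE_PAIRS are abbreviated placeholders, and the actual enumerated requirements appear in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><preformat>
# Sketch of scratch note generation via few-shot inference; SYSTEM_PROMPT
# and EXAMPLE_PAIRS are abbreviated placeholders (see Figure 2).
from itertools import cycle
import torch
from transformers import pipeline

SYSTEM_PROMPT = "Write a scratch note from the SOAP note. Requirements: 1. ..."
EXAMPLE_PAIRS = cycle([("example scratch note ...", "example SOAP note ...")])

generator = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-70b-chat-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

def make_scratch_note(soap_note):
    scratch, soap = next(EXAMPLE_PAIRS)  # cycle through curated examples
    prompt = (
        f"{SYSTEM_PROMPT}\n\nSOAP note:\n{soap}\nScratch note:\n{scratch}"
        f"\n\nSOAP note:\n{soap_note}\nScratch note:\n"
    )
    out = generator(prompt, do_sample=True, temperature=0.8, max_new_tokens=512)
    return out[0]["generated_text"][len(prompt):]
</preformat><p>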
The instructional prompt and specific few-shot examples had less impact.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Final prompt structure used for scratch note generation. SOAP: subjective-objective-assessment-plan.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e73274_fig02.png"/></fig></sec><sec id="s3-2"><title>Model Training Metrics</title><p>DAPT reduced the training loss metric for the Llama 2 7B Chat model from 2.5 at the beginning of training to 1.4 at the end, suggesting that this training improved the model&#x2019;s ability to perform CLM on historical progress notes. <xref ref-type="table" rid="table1">Table 1</xref> shows the reduction in loss with training for each fine-tuned model as well as the corresponding training times. The impacts of DAPT and LoRA versus full fine-tuning with Llama 2 on the GoldCare dataset were evaluated first. The loss values suggested that DAPT was detrimental to fine-tuning, and that full fine-tuning outperformed LoRA (<xref ref-type="table" rid="table1">Table 1</xref>). Full fine-tuning without DAPT with both Llama 2 and Llama 3, on both the GoldCare and AlayaCare datasets, was subsequently tested. All the fine-tuned models showed consistent improvement in training and evaluation loss across all training runs.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Fine-tuning: runtime and impact on loss.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset</td><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Final training loss (as percent of initial training loss)</td><td align="left" valign="bottom">Evaluation loss (as percent of initial training loss)</td><td align="left" valign="bottom">Training runtime, seconds</td></tr></thead><tbody><tr><td align="left" valign="top">AlayaCare</td><td align="left" valign="top">Fine-tuned Llama 3</td><td align="left" valign="top">82.5</td><td align="left" valign="top">55.8</td><td align="left" valign="top">715</td></tr><tr><td align="left" valign="top">AlayaCare</td><td align="left" valign="top">Fine-tuned Llama 2</td><td align="left" valign="top">84.6</td><td align="left" valign="top">73.0</td><td align="left" valign="top">639</td></tr><tr><td align="left" valign="top">GoldCare</td><td align="left" valign="top">Fine-tuned Llama 2</td><td align="left" valign="top">73.2</td><td align="left" valign="top">88.8</td><td align="left" valign="top">580</td></tr><tr><td align="left" valign="top">GoldCare</td><td align="left" valign="top">LoRA<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> and fine-tuned Llama 2</td><td align="left" valign="top">93.5</td><td align="left" valign="top">84.0</td><td align="left" valign="top">586</td></tr><tr><td align="left" valign="top">GoldCare</td><td align="left" valign="top">DAPT<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> and fine-tuned Llama 2</td><td align="left" valign="top">89.4</td><td align="left" valign="top">83.1</td><td align="left" valign="top">1727</td></tr><tr><td align="left" valign="top">GoldCare</td><td align="left" valign="top">DAPT and LoRA fine-tuned Llama 2</td><td align="left" valign="top">93.5</td><td align="left" valign="top">84.0</td><td align="left" valign="top">586</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LoRA: low-rank adaptation.</p></fn><fn id="table1fn2"><p><sup>b</sup>DAPT: domain-adaptive 
pretraining.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Qualitative Analysis of Model Outputs</title><p>Consistent with the higher loss of the DAPT vs non-DAPT Llama 2 model (<xref ref-type="table" rid="table1">Table 1</xref>), the outputs from the DAPT Llama 2 model were of low quality, even compared with the original Llama 2 7B Chat model. The DAPT model produced notes that were inappropriately short, missed important details, and often misrepresented the content of the provided scratch note. In comparison, the original Llama 2 model produced notes that were the appropriate length and format, and which better reflected the scratch note&#x2019;s information, despite also missing important details and misrepresenting information. (In <xref ref-type="table" rid="table1">Table 1</xref>, final training and evaluation losses are presented as percentages of the first-recorded loss because absolute loss values are not comparable between Llama 2 and Llama 3; lower percentages are better. Fine-tuning decreased loss in each case and was computationally inexpensive.)</p><p>The fine-tuned Llama 3 model generated outputs that demonstrated the following strengths: an ability to consistently format the notes as desired, with the SOAP note headings and use of bullet points, as well as an ability to write clear, easy-to-read text. However, the Llama 3 model did not always organize the information well into the different sections. For instance, the model sometimes put observations in the subjective section. Meanwhile, SOAP notes generated with the Llama 2 fine-tuned models tended not to use bullet points despite instructions to do so, did not use the correct headings when trained on the GoldCare dataset (where many of the examples did not have SOAP headings), and were less readable than the Llama 3 model&#x2019;s notes. Notes produced by the Llama 2 models tended to repeat what was written in the given scratch note verbatim more often than the Llama 3 model did.</p><p>The LoRA fine-tuned Llama 2 model also tended to produce text that was repetitive, not self-consistent, and lacking correct SOAP headings despite explicit instructions. Models that were pretrained on domain data and then fine-tuned with and without the use of LoRA tended to include extra text that was not present or indicated in the scratch note. An example of such irrelevant text that was often included is &#x201C;All appropriate COVID-19 precautions taken,&#x201D; despite the scratch notes indicating nothing to this effect.</p><p>When comparing the LoRA fine-tuned models with the equivalent fully fine-tuned models, the outputs of the LoRA models generally included text that was unnecessarily verbose and were more likely to include hallucinations. The fully fine-tuned models tended to be more concise and to stop at the appropriate point in generation.</p><p>From these observations, it was clear that the best-performing models were the fully fine-tuned Llama 2 and Llama 3 models. To compare these models more systematically, each model was used to generate SOAP notes from 10 different scratch notes. Three researchers (RD, TT, and TY) reviewed these 10 pairs of notes and voted independently on whether the Llama 2 or Llama 3 note performed best in each case. Fine-tuned Llama 3 notes received 20 of the 30 votes, whereas fine-tuned Llama 2 notes received 10, suggesting that Llama 3 was stronger overall. Sample-generated notes can be seen in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
Thus, the fine-tuned Llama 3 8B Instruct model (hereafter referred to as the custom model) was chosen for the pilot.</p></sec><sec id="s3-4"><title>Pilot Study Findings</title><sec id="s3-4-1"><title>Qualitative Feedback</title><p>At the start of the KidsAbility pilot study, much of the feedback about the custom model included actionable suggestions for improvement. Clinicians requested that the model use a more consistent bullet point marker, which was quickly addressed by adding a find-and-replace function to the code. They also noted that the model produced high-quality plan and analysis sections, especially compared with the Copilot model, and that the text it generated was professional, concise, readable, and familiar in tone. Sample-generated notes can be seen in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, listings 1 and 3, for Llama 3 and Copilot, respectively. Negative feedback included frustrations with the editing process, incorrect placement of information under headings, occasional omissions of details from scratch notes, and rare instances of irrelevant or incorrect text being added.</p><p>Subjective reporting of the time taken to write SOAP notes with the custom model varied from 5 to 26 minutes, with an average of 13.83 (SD 7.10) minutes (n=73). In comparison, the average time for using Copilot was 14.08 (SD 8.99) minutes (n=190). Considerable variability in the time required to complete notes was reported for both models, and the difference between models was not statistically significant.</p></sec><sec id="s3-4-2"><title>Dependence of Time Savings on Scratch Note Style</title><p>Some therapists appeared to use the custom model more efficiently than others. Upon investigation, the KidsAbility Innovation Team found that the 2 therapists who reported the best time savings were providing much shorter scratch notes, while other therapists were spending more time writing detailed scratch notes that already closely resembled full SOAP notes. Following this discovery, the innovation team held one-on-one sessions with 6 of the clinicians, coaching them to provide sparser scratch notes and to spend less time writing them. Examples of a detailed scratch note and a sparse scratch note can be seen in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><p>The average time taken after this coaching, based on self-reporting, was 7.6 (SD 3) minutes (n=6). An example from one of the coaching sessions illustrates the potential impact of coaching. Initially, a clinician spent 10 minutes writing detailed scratch notes, including SOAP headings. After one-on-one coaching, the clinician produced a much sparser version of the scratch notes in 1 minute, and the entire process was completed in approximately 6.5 minutes. While anecdotal, these observations suggest that the model has the potential to reduce documentation time, but the actual time savings depend strongly on how the model is used.</p></sec><sec id="s3-4-3"><title>Comparison of Unedited and Edited Model Notes</title><p>The clinicians at KidsAbility were accountable for reviewing all generated notes and editing them as appropriate. ROUGE (Recall-Oriented Understudy for Gisting Evaluation) scores [<xref ref-type="bibr" rid="ref63">63</xref>] were calculated to compare each edited model note against the unedited, generated note.</p><p>ROUGE scores measure similarity between a piece of text and a reference piece of text. 
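</p><p>As a sketch, the per-note comparison can be computed with Google&#x2019;s rouge-score package (the package choice is an assumption; any standard ROUGE implementation would serve):</p><preformat>
# Sketch of the per-note ROUGE comparison between the edited (reference)
# and unedited (candidate) versions of each note; note texts are placeholders.
from rouge_score import rouge_scorer

edited_note = "S: Client reported ..."     # clinician-edited note (placeholder)
generated_note = "S: Client reported ..."  # unedited model output (placeholder)

scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True
)
scores = scorer.score(target=edited_note, prediction=generated_note)
print(scores["rouge1"].fmeasure)  # F1 over overlapping words
</preformat><p>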
ROUGE-1 scores are <italic>F</italic><sub>1</sub>-scores for which the recall and precision are calculated using the number of overlapping words. While ROUGE-1 scores provide a measure of the similarity based on the number of individual words that are the same, ROUGE-2 scores are based on 2-word sequences. ROUGE-L and ROUGE-LSUM scores indicate sentence-level similarity and similarity at the level of the entire note, respectively [<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>]. A higher score indicates a higher degree of similarity and, in this case, less editing.</p><p>Sixty-four pairs of edited and unedited notes were compared. The means and SDs of 4 different ROUGE metrics calculated for all model output notes compared with the final edited notes are shown in <xref ref-type="table" rid="table2">Table 2</xref>. All the ROUGE scores were high, indicating light editing at both the word and document levels. Somewhat lower ROUGE-2 and ROUGE-L scores may indicate that most changes are related to word order and sentence structure.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>ROUGE<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> scores comparing custom model notes before and after editing by clinicians (higher scores mean less editing at various levels of granularity).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Score, mean (SD)</td><td align="left" valign="bottom">95% CI</td></tr></thead><tbody><tr><td align="left" valign="top">ROUGE-1</td><td align="char" char="plusmn" valign="top">0.71 (0.17)</td><td align="char" char="." valign="top">0.67-0.75</td></tr><tr><td align="left" valign="top">ROUGE-2</td><td align="char" char="plusmn" valign="top">0.58 (0.25)</td><td align="char" char="." valign="top">0.52-0.64</td></tr><tr><td align="left" valign="top">ROUGE-L</td><td align="char" char="plusmn" valign="top">0.63 (0.24)</td><td align="char" char="." valign="top">0.57-0.69</td></tr><tr><td align="left" valign="top">ROUGE-LSUM</td><td align="char" char="plusmn" valign="top">0.70 (0.18)</td><td align="char" char="." valign="top">0.65-0.74</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>ROUGE: Recall-Oriented Understudy for Gisting Evaluation. </p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4-4"><title>SOAP Note Quality Assessment</title><p>The mean scores for the 5 criteria across the 4 different groups of SOAP notes are shown in <xref ref-type="table" rid="table3">Table 3</xref>, and the corresponding SDs are shown in <xref ref-type="table" rid="table4">Table 4</xref>. The Custom Edited notes had the highest or second-highest scores for all 5 criteria. Meanwhile, the Non-AI notes were consistently scored lowest except on the Conciseness criterion. Finally, the custom model notes had the lowest SDs across all criteria, with the Custom Edited notes having the lowest for 4 out of 5. 
It is possible that the low SD indicates higher consistency in quality.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Mean scores (out of a maximum of 3) assigned to SOAP<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> notes within each note type category.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Note type</td><td align="left" valign="bottom">Clear</td><td align="left" valign="bottom">Complete</td><td align="left" valign="bottom">Concise</td><td align="left" valign="bottom">Relevant</td><td align="left" valign="bottom">Organized</td></tr></thead><tbody><tr><td align="left" valign="top">Non-AI<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> note</td><td align="left" valign="top">2.24</td><td align="left" valign="top">1.94</td><td align="left" valign="top">2.57</td><td align="left" valign="top">2.61</td><td align="left" valign="top">2.06</td></tr><tr><td align="left" valign="top">Copilot Edited</td><td align="left" valign="top">2.65</td><td align="left" valign="top"><italic>2.66</italic><sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">2.34</td><td align="left" valign="top">2.77</td><td align="left" valign="top"><italic>2.51</italic><sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Custom</td><td align="left" valign="top">2.50</td><td align="left" valign="top">2.55</td><td align="left" valign="top">2.55</td><td align="left" valign="top">2.76</td><td align="left" valign="top">2.33</td></tr><tr><td align="left" valign="top">Custom Edited</td><td align="left" valign="top"><italic>2.66</italic><sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">2.61</td><td align="left" valign="top"><italic>2.57</italic><sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top"><italic>2.80</italic><sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">2.49</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>SOAP: subjective-objective-assessment-plan.</p></fn><fn id="table3fn2"><p><sup>b</sup>AI: artificial intelligence.</p></fn><fn id="table3fn3"><p><sup>c</sup>The highest mean scores for each criterion are in italics.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>SD of the scores assigned to SOAP<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> notes averaged across the different scorers within each note type category.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Note type</td><td align="left" valign="bottom">Clear</td><td align="left" valign="bottom">Complete</td><td align="left" valign="bottom">Concise</td><td align="left" valign="bottom">Relevant</td><td align="left" valign="bottom">Organized</td></tr></thead><tbody><tr><td align="left" valign="top">Non-AI<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> note</td><td align="left" valign="top">0.59</td><td align="left" valign="top">0.62</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.52</td><td align="left" valign="top">0.60</td></tr><tr><td align="left" valign="top">Copilot Edited</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.65</td><td align="left" 
valign="top">0.40</td><td align="left" valign="top">0.59</td></tr><tr><td align="left" valign="top">Custom</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.52</td><td align="left" valign="top"><italic>0.56</italic><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0.43</td><td align="left" valign="top">0.64</td></tr><tr><td align="left" valign="top">Custom Edited</td><td align="left" valign="top"><italic>0.48</italic><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top"><italic>0.46</italic><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0.57</td><td align="left" valign="top"><italic>0.33</italic><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top"><italic>0.56</italic><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>SOAP: subjective-objective-assessment-plan.</p></fn><fn id="table4fn2"><p><sup>b</sup>AI: artificial intelligence.</p></fn><fn id="table4fn3"><p><sup>c</sup>The lowest SD for each criterion is in italics.</p></fn></table-wrap-foot></table-wrap><p>Two-factor ANOVA tests with replication on each of the 5 criteria were performed, with the 2 factors being the clinician evaluator and the category of note. Both main factors were significant with <italic>P</italic>&#x003C;.001 across all criteria (17.37 &#x2264; <italic>F</italic><sub>3,1004</sub> &#x2264;126.17). There were also moderate interaction effects between the 2 factors for all criteria, the greatest of which was <italic>F</italic><sub>9,1004</sub>=12.16, <italic>P</italic>&#x003C;.001 for clarity. The ANOVA test was appropriate as a robust omnibus test for Likert-type ratings given the large, balanced sample and approximately symmetrical residuals.</p><p>Pair-wise comparisons between scores of different note categories were performed using Tukey test to correct the significance threshold for multiple comparisons. These tests showed that there was a significant difference (0.25&#x2010;0.77, <italic>P</italic>&#x003C;.001) in mean scores between the Non-AI notes and the Custom Edited notes for all criteria except for conciseness. Conversely, there was a significant difference (0.31, <italic>P</italic>&#x003C;.001) between the mean scores for the custom model notes and Copilot notes only for conciseness. See <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> for further details.</p><p>Interrater reliability was measured using the intraclass correlation coefficient (ICC) metric. The metric varied by criterion, with single-rater reliabilities being low (ICC<sub>3,1</sub>=0.16-0.41), indicating limited interchangeability of clinician evaluators [<xref ref-type="bibr" rid="ref65">65</xref>]. In contrast, reliability of the average rating across clinicians was higher (ICC<sub>3,4</sub>=0.43-0.74).</p></sec><sec id="s3-4-5"><title>Objective Timing Data</title><p>Online meetings were used to record more accurate timing data for a small number of notes. A member of the research team timed the note-taking duration of 18 instances, including 10 using the custom model and 8 manually written. The average time spent on writing notes with the custom model was 9.55 (SD 3.5) minutes (n=10). The manually written notes took an average of 7.84 (SD 3.1) minutes (n=8). 
</sec><sec id="s3-4-5"><title>Objective Timing Data</title><p>Online meetings were used to record more accurate timing data for a small number of notes. A member of the research team timed the note-taking duration of 18 instances, including 10 using the custom model and 8 written manually. The average time spent writing notes with the custom model was 9.55 (SD 3.5) minutes (n=10). The manually written notes took an average of 7.84 (SD 3.1) minutes (n=8). Notably, the recorded times varied widely for both categories of notes. Differences in note complexity contributed to this variability. For example, for one of the custom model notes, the clinician commented that it was much more complex than usual and could be considered an outlier. Similarly, the clinician who manually wrote another note commented that it was shorter than usual because it was for a brief virtual call that did not involve any interaction with the client. Excluding these 2 outliers, the average time taken to write notes with the custom model was 8.76 (SD 2.6) minutes, while the average time for the manual process was 8.38 (SD 2.9) minutes. This difference was not statistically significant.</p><p>Although 3 of the 4 participating clinicians had previously received one-on-one coaching on using the model more effectively, they did not appear to apply the recommended strategy: their scratch notes were longer and more detailed than those they had been coached to write.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overview</title><p>Non&#x2013;direct administrative tasks are a substantial burden in pediatric rehabilitation, contributing to burnout [<xref ref-type="bibr" rid="ref12">12</xref>], as well as reducing therapist capacity and contributing to growing waitlists for care. A quick fix for this issue is not apparent. Therapists perform a variety of administrative tasks that may require varied interventions. Documentation cannot be completely automated because therapists remain accountable for the accuracy and completeness of the documentation, because existing technology produces errors, and because some of the relevant information may be known only to therapists. Producing progress notes is a substantial administrative task, taking about one-third the time of direct care. The recent success of automated documentation in other clinical settings suggests that LLMs may be able to improve the efficiency of this process, an important step in reducing administrative burden overall. A customized LLM was developed and tested for this application, motivated by the goals of optimizing performance, keeping PHI under the control of the health care institution, and minimizing the climate impact associated with use of the model.</p></sec><sec id="s4-2"><title>Governance and Ethical Considerations</title><p>The development and deployment of the custom model were overseen by the KidsAbility Innovation team, and model access was restricted to clinicians within a secure, institutionally authorized environment. System logging was implemented, with data stored securely, enabling review and auditing of generated notes. The clinicians remained legally responsible for the content of the final SOAP notes, with the model notes considered drafts. They were required to review and edit every draft progress note. While the custom model was the best-performing model of those tested during this study, it still produced irrelevant boilerplate text or hallucinations on occasion, so it was imperative that clinicians carefully review the notes for such errors and remove them during editing.</p><p>Several ethical risks were considered in development, including data privacy risks, which were mitigated by deidentifying the training data and removing PHI from note text.
Another risk for such a system could be automation bias or clinician overreliance, which was addressed largely by emphasizing the need for human review and accountability. The draft status of the generated notes was made clear, and the clinician was responsible for finalizing or submitting any data to the EMR. Finally, fairness considerations meant that the model was not assumed to generalize across settings, given that the data reflect only KidsAbility patient demographics and documentation practices. Moving forward, it will be important to monitor the model&#x2019;s outputs to ensure that there are no systematic errors or biases.</p></sec><sec id="s4-3"><title>Principal Findings</title><sec id="s4-3-1"><title>SOAP Note Quality Measures</title><p>A surprising result was that progress notes produced using this model scored better on multiple quality measures than manually written notes. Specifically, compared with SOAP notes written without the use of LLMs, those written with either Copilot or the custom model were given higher scores for all 5 criteria, apart from Conciseness. Non-AI notes received higher scores for Conciseness than the Copilot model notes but not the edited custom model notes. Therefore, the use of LLMs did not degrade the quality of the SOAP notes but instead helped to produce higher-quality SOAP notes.</p><p>Progress notes produced with the custom model also compared favorably with progress notes produced with Microsoft Copilot. The custom model received higher scores for conciseness than the Copilot model regardless of editing, but otherwise the scores for the edited notes of both models were similar. Clinicians informally corroborated this finding, reporting that the Copilot notes were longer and contained more &#x201C;fluff&#x201D; than those from the custom model.</p></sec><sec id="s4-3-2"><title>Time Savings</title><p>The impact of the models on therapists&#x2019; time was less clear. During the pilot study, self-reported time savings varied substantially between therapists. Further investigation suggested that this variation related to the different ways in which therapists used the model. Most clinicians in the pilot wrote long, detailed scratch notes and still spent significant time editing the generated notes. This approach was inefficient, as the model could only rephrase or rearrange points without adding meaningful content. Therapists who used the model more effectively found that writing quick, minimal scratch notes allowed the model to produce a usable draft, which they could then edit. After being coached by the research team, all therapists reported better results and were able to create quality SOAP notes in less time, demonstrating the importance of hands-on training to maximize the model&#x2019;s effectiveness.</p><p>It is unsurprising that writing more detailed scratch notes would reduce the benefit of the system but perhaps more surprising that many therapists were inclined to do so. During the objective timing study, several therapists were observed to have reverted to giving the model long, detailed scratch notes despite previous coaching and positive experience providing short, sparse notes. Further work is needed to understand how best to encourage more efficient use of the system.
For example, this could involve user interface changes to show an ideal example scratch note or a limit on the number of words that can be entered in the scratch note interface. Clinicians specifically expressed a wish to provide further prompts or instructions to the model for a given note, which could improve model usability with sparse scratch notes. Alternatively, the model limitations that may be driving clinicians to provide longer scratch notes could be analyzed. The synthetic training data, which were better structured and clearer than typical manual scratch notes, may have introduced a bias toward longer and more complete scratch notes. Further fine-tuning of the model on the clinicians&#x2019; scratch notes and the progress notes produced by the model and edited by clinicians could improve model performance. If a further fine-tuned model produced progress notes more aligned with clinician preferences given sparse scratch notes, clinicians might be less inclined to provide overly long scratch notes. Overall, work to promote efficient use of the system appears to be critical for realizing the efficiency gains that would be needed to impact waitlists for treatment. However, even if the model were used optimally, impact on waitlists would require therapists to take on larger caseloads.</p><p>Clinicians&#x2019; previous experiences with the Copilot model may also have contributed to their initial, less effective use of the custom model. Some had developed biases, believing that detailed input was necessary for generating useful SOAP notes, as they were dissatisfied with Copilot&#x2019;s Analysis and Plan sections when sparse notes were provided. As a result, some clinicians used only Copilot for the Subjective and Objective sections, preferring to write the other parts themselves. This may have influenced their approach to the new model.</p></sec></sec><sec id="s4-4"><title>Model Development Lessons</title><sec id="s4-4-1"><title>DAPT</title><p>The models that underwent DAPT experienced catastrophic forgetting, where their ability to respond to prompts worsened after continued training. See <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> for an example of degraded performance. Specifically, DAPT caused the models to ignore instructions, similar to the findings of Cheng et al [<xref ref-type="bibr" rid="ref66">66</xref>], who propose the use of reading comprehension tasks to reduce this issue by training the model on more diverse prompts. However, their methodology was impractical for reframing the SOAP note dataset. Thus, it is hypothesized that a lack of diversity in the DAPT prompts contributed to the degradation in performance, as the training examples did not contain any explicit instructions and merely presented SOAP notes; this contrast is illustrated in the sketch below.</p><p>Huang et al [<xref ref-type="bibr" rid="ref67">67</xref>] also propose a mitigation strategy for catastrophic forgetting due to continued training, using self-synthesized data that are meant to reflect the original training data to retain the model&#x2019;s original proficiencies. Future work on this project could use similar methods to include other types of examples, reflective of original training examples, in the process of DAPT along with the examples of SOAP notes.</p>
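<p>The following sketch (with invented note text) makes the hypothesized diversity gap concrete by contrasting a plain DAPT training sample with an instruction-formatted fine-tuning sample.</p><preformat>
# Illustration (invented text) of the difference in training formats.
# Under DAPT, the causal language modeling objective sees only raw SOAP
# notes with no instructions, so instruction following can degrade.
dapt_sample = (
    "S: Client reported a good week at school.\n"
    "O: Completed cutting task with hand-over-hand support.\n"
    "A: Bilateral coordination is improving.\n"
    "P: Continue weekly sessions targeting fine motor skills."
)

# Instruction fine-tuning instead pairs every note with an explicit
# prompt, preserving the model's exposure to instruction-style inputs.
instruction_sample = (
    "Instruction: Convert the following scratch notes from a pediatric "
    "occupational therapy session into a SOAP-format progress note.\n"
    "Scratch notes: good week at school; cutting with hand-over-hand "
    "support; continue fine motor goals.\n"
    "Response: " + dapt_sample
)
</preformat>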
<p>&#x00D6;ncel et al [<xref ref-type="bibr" rid="ref68">68</xref>] explored factors that play a role in whether performance degrades with further training. They found that greater similarity between the original training corpus and the new domain&#x2019;s corpus resulted in more degradation. Given the breadth of Llama 2&#x2019;s training corpus, it is possible that the original data the model was trained on contained examples very similar to our DAPT data. &#x00D6;ncel et al also found that increasing the model size could mitigate performance degradation. Therefore, it is hypothesized that the small size of the model (relative to other language models) played a role in the observed catastrophic forgetting. An interesting avenue for future work could include training the model on smaller, more curated datasets and determining when catastrophic forgetting begins to emerge.</p><p>Additionally, the models that underwent DAPT were more prone to hallucinating irrelevant text in the SOAP notes, perhaps due to low variety in the DAPT examples. The models often generated repetitive phrases, such as COVID-19 protocol text, instead of responding dynamically to the scratch notes. &#x00D6;ncel et al [<xref ref-type="bibr" rid="ref68">68</xref>] relatedly observed that models tend to improve at predicting domain-specific tokens while becoming worse at generic tokens, especially structural tokens. The COVID-19 protocol text appears frequently in the training data and so may be treated by the model as domain-specific tokens that should be predicted often.</p></sec><sec id="s4-4-2"><title>Parameter-Efficient Fine-Tuning</title><p>LoRA fine-tuning was faster than full fine-tuning (9.7 minutes vs 28.9 minutes on the GoldCare fine-tuning dataset), although merging the adapter weights back into the model added 12 minutes on average. However, despite this minor speed advantage, LoRA fine-tuned models had higher training and evaluation losses than fully fine-tuned models.</p><p>Consistent with these higher losses, the LoRA-trained models generated SOAP notes that were less concise, often overly long, and prone to hallucinating irrelevant or inappropriate text. Many notes failed to follow the correct SOAP format and sometimes included repetitive or nonsensical sections, with placeholder tokens such as &#x201C;[GOAL]&#x201D; or &#x201C;[DATE]&#x201D; appearing in the note. These issues made the LoRA fine-tuned models less suitable for this task than fully fine-tuned models. The sketch below outlines the general LoRA training and merging setup.</p>
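<p>The following is a minimal sketch of LoRA fine-tuning and adapter merging using the Hugging Face PEFT library; the base model identifier and hyperparameters are illustrative rather than the values used in this study.</p><preformat>
# Minimal sketch of LoRA fine-tuning and adapter merging with the
# Hugging Face PEFT library; hyperparameters are illustrative only.
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # illustrative base model
    torch_dtype=torch.bfloat16,
)

# Low-rank adapters are injected into the attention projections; only
# these small matrices are trained, which is why LoRA runs faster than
# full fine-tuning.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, config)
model.print_trainable_parameters()  # small fraction of total weights

# ... fine-tune `model` on scratch-note/SOAP-note pairs here ...

# Merging folds the adapter weights back into the base model (the step
# reported above to add about 12 minutes), yielding a standard
# checkpoint that can be served without the PEFT wrapper.
merged = model.merge_and_unload()
merged.save_pretrained("llama2-soap-lora-merged")
</preformat>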
</sec><sec id="s4-4-3"><title>Llama 2 Versus Llama 3</title><p>Comparing the generated notes of the Llama 2 and Llama 3 fine-tuned models, the Llama 3 model generally produced more readable notes. The Llama 3 model was more likely to change the wording of a particular point in a scratch note to improve clarity while still conveying the original meaning. Furthermore, a large difference between the 2 models was that the fine-tuned Llama 3 model responded more appropriately to instructions after training. The fine-tuned Llama 2 model, like the DAPT models, demonstrated a reduced ability to respond to instructions [<xref ref-type="bibr" rid="ref66">66</xref>], most likely due to training on a narrower task. However, the Llama 3 model did not exhibit decreased performance in responding to instructions, instead responding appropriately to different prompts even after fine-tuning. In this way, the fine-tuned Llama 3 model could be prompted to, for instance, use only specific headers or to target a specific length.</p></sec></sec><sec id="s4-5"><title>Limitations</title><sec id="s4-5-1"><title>Workflow Limitations</title><p>Clinician availability was a limiting factor in this work due to clinicians&#x2019; existing workloads. Clinicians were not available for extensive evaluation of the model outputs, nor was there time for clinicians to produce scratch notes for use in training the model. Furthermore, the sequential, nonrandomized design of the pilot study potentially introduced order effects: use of the Copilot model before the custom model may have influenced how the clinicians used the custom model. Comparative performance and time-saving metrics between the 2 models should be interpreted with this in mind.</p><p>The impact of the system on therapists&#x2019; time was unclear. Timing data from the pilot study were self-reported. The team made a preliminary attempt to collect more accurate timing data through online meetings. A statistical power analysis with G*Power indicated that a sample size of 176 for each category (total 352 data points) would be needed to detect a moderate difference; a sketch reproducing this calculation is shown below. Thus, a much larger sample would be required, and the described findings cannot be considered definitive. However, the limited data collected, and the fact that therapists had reverted to an inefficient process, argue against substantial time savings. In future work, it would be best to revise the process so that therapists provide sparse scratch notes before collecting a larger timing dataset. Additionally, future work could include clinician satisfaction ratings or other usability metrics and feedback to report more thoroughly on the clinician experience beyond the time taken to write notes.</p>
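<p>The following is a minimal sketch of the timing comparison and the sample-size calculation using SciPy and statsmodels in place of G*Power; the timing values are placeholders, and an assumed effect size of <italic>d</italic>=0.3 with &#x03B1;=.05 and power of .80 reproduces the reported 176 notes per category.</p><preformat>
# Minimal sketch of the timing comparison and power analysis; the timing
# lists are placeholders, not the study's measurements.
import math
from scipy import stats
from statsmodels.stats.power import TTestIndPower

custom_minutes = [9.5, 8.0, 11.2, 7.8, 10.1, 9.0, 8.6, 12.3, 9.9, 8.4]
manual_minutes = [7.8, 8.5, 6.9, 9.2, 7.1, 8.8, 6.5, 7.9]

# Welch t test for the difference in mean note-writing time (the paper
# does not specify the exact test used).
t_stat, p_value = stats.ttest_ind(custom_minutes, manual_minutes,
                                  equal_var=False)
print(f"t={t_stat:.2f}, P={p_value:.3f}")

# Per-group sample size needed to detect an effect of d = 0.3 at
# alpha = .05 with 80% power (two-sided, equal group sizes).
n_per_group = TTestIndPower().solve_power(
    effect_size=0.3, alpha=0.05, power=0.8, alternative="two-sided"
)
print(math.ceil(n_per_group))  # 176 per category, 352 in total
</preformat>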
</sec><sec id="s4-5-2"><title>Model Limitations</title><p>Regarding model performance, the choice to train for only a single epoch may have resulted in poorer performance than could have been achieved with further training (eg, with 3 epochs, as commonly suggested for such fine-tuning) [<xref ref-type="bibr" rid="ref69">69</xref>,<xref ref-type="bibr" rid="ref70">70</xref>]. There is an inherent trade-off: limiting the number of epochs to mitigate memorization and privacy risks likely reduces the model&#x2019;s performance and generalizability. The use of synthetic scratch notes may also have introduced bias and degraded performance.</p><p>Finally, this study focused only on OT SOAP notes for a single institution, and it is not clear whether the same type of model would generalize to other settings or disciplines such as physiotherapy. The model would likely not be as useful for other organizations, given that it is trained to mimic the style and culture of the therapists at KidsAbility. A model for a different program at a different organization or for a new discipline would require adaptations but could likely be developed following the same general procedure.</p></sec></sec><sec id="s4-6"><title>Conclusions</title><p>Our customized LLM, which was fine-tuned on pairs of historical progress notes and corresponding synthetic scratch notes, produced progress notes that were similar to or better than manually written notes in all the evaluated quality dimensions. Copilot performed similarly but was less concise. Copilot&#x2019;s technical details are not public, but it is likely more resource-intensive to run, by perhaps 2 orders of magnitude. Fine-tuned models such as this have the potential to help health care practitioners in specialized domains write high-quality documentation, in a cost-effective and efficient manner, without sending PHI to third parties.</p><p>The analysis of pilot data and the coaching conducted after the pilot study suggested that the model could reduce time spent writing progress notes, but only if therapists provided short, sparse scratch notes and spent most of their time editing the generated note. Coaching several therapists to use the model in this way appeared to help them produce progress notes faster. However, the therapists reverted to writing detailed notes during the collection of timing data, and no statistically significant time reduction was found when using the custom model compared with the manual process. When writing longer, more detailed scratch notes, the model primarily improved quality rather than efficiency. Altogether, time savings seemed to depend strongly on therapists letting the model elaborate sparse scratch notes, but they often seemed disinclined to do so.</p><p>To further improve generated progress notes, the model could undergo further fine-tuning on pairs of scratch notes and edited progress notes on an ongoing basis. This might result in progress notes that are even more accurate and more in line with clinician preferences.</p></sec></sec></body><back><ack><p>The authors thank the KidsAbility Research and Innovation team, particularly Aisha Hussain, Cynthia Lennon, and Allison Gaudet, as well as the KidsAbility clinicians who supported this work. The authors also thank Levio Business and Technology for providing access to Amazon Web Services resources in the early stages of this project. Generative AI (ChatGPT) was used conservatively to suggest language and formatting improvements in the review stage.</p></ack><notes><sec><title>Funding</title><p>This work was funded in part by KidsAbility, Mitacs, and Smile Digital Health.</p></sec><sec><title>Data Availability</title><p>The data used in the study cannot be publicly released due to institutional data governance and privacy requirements. However, redacted model training code and note generation code will be made available on GitHub [<xref ref-type="bibr" rid="ref71">71</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>RD, TT, and TY developed the model and application and performed the analysis. IK managed the pilot. BWT conceived and guided the research. BT supervised the research. RD, TT, TY, and BT wrote the paper.
All authors reviewed the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CLM</term><def><p>causal language modeling</p></def></def-item><def-item><term id="abb3">DAPT</term><def><p>domain-adaptive pretraining</p></def></def-item><def-item><term id="abb4">EMR</term><def><p>electronic medical record</p></def></def-item><def-item><term id="abb5">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb8">OT</term><def><p>occupational therapy</p></def></def-item><def-item><term id="abb9">PHI</term><def><p>personal health information</p></def></def-item><def-item><term id="abb10">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item><def-item><term id="abb11">SOAP</term><def><p>subjective-objective-assessment-plan</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Davidson</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Zwemer</surname><given-names>FL</given-names> </name><name name-style="western"><surname>Nathanson</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Sable</surname><given-names>KN</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>A</given-names> </name></person-group><article-title>Where&#x2019;s the beef? 
the promise and the reality of clinical documentation</article-title><source>Acad Emerg Med</source><year>2004</year><month>11</month><volume>11</volume><issue>11</issue><fpage>1127</fpage><lpage>1134</lpage><pub-id pub-id-type="doi">10.1197/j.aem.2004.08.004</pub-id><pub-id pub-id-type="medline">15528575</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zierler-Brown</surname><given-names>S</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Blackburn</surname><given-names>RW</given-names> </name></person-group><article-title>Clinical documentation for patient care: models, concepts, and liability considerations for pharmacists</article-title><source>Am J Health Syst Pharm</source><year>2007</year><month>09</month><day>1</day><volume>64</volume><issue>17</issue><fpage>1851</fpage><lpage>1858</lpage><pub-id pub-id-type="doi">10.2146/ajhp060682</pub-id><pub-id pub-id-type="medline">17724368</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blair</surname><given-names>W</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>B</given-names> </name></person-group><article-title>Nursing documentation: frameworks and barriers</article-title><source>Contemp Nurse</source><year>2012</year><month>06</month><volume>41</volume><issue>2</issue><fpage>160</fpage><lpage>168</lpage><pub-id pub-id-type="doi">10.5172/conu.2012.41.2.160</pub-id><pub-id pub-id-type="medline">22800381</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arndt</surname><given-names>BG</given-names> </name><name name-style="western"><surname>Beasley</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Watkinson</surname><given-names>MD</given-names> </name><etal/></person-group><article-title>Tethered to the EHR: primary care physician workload assessment using EHR event log data and time-motion observations</article-title><source>Ann Fam Med</source><year>2017</year><month>09</month><volume>15</volume><issue>5</issue><fpage>419</fpage><lpage>426</lpage><pub-id pub-id-type="doi">10.1370/afm.2121</pub-id><pub-id pub-id-type="medline">28893811</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buchanan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jelsma</surname><given-names>J</given-names> </name><name name-style="western"><surname>Siegfried</surname><given-names>N</given-names> </name></person-group><article-title>Practice-based evidence: evaluating the quality of occupational therapy patient records as evidence for practice</article-title><source>S Afr J Occup Ther</source><year>2016</year><volume>46</volume><issue>1</issue><fpage>65</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.17159/2310-3833/2016/v46n1a13</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Shanafelt</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Balch</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Bechamps</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Burnout and medical errors among American surgeons</article-title><source>Ann Surg</source><year>2010</year><month>06</month><volume>251</volume><issue>6</issue><fpage>995</fpage><lpage>1000</lpage><pub-id pub-id-type="doi">10.1097/SLA.0b013e3181bfdab3</pub-id><pub-id pub-id-type="medline">19934755</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dewa</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Jacobs</surname><given-names>P</given-names> </name><name name-style="western"><surname>Thanh</surname><given-names>NX</given-names> </name><name name-style="western"><surname>Loong</surname><given-names>D</given-names> </name></person-group><article-title>An estimate of the cost of burnout on early retirement and reduction in clinical hours of practicing physicians in Canada</article-title><source>BMC Health Serv Res</source><year>2014</year><month>06</month><day>13</day><volume>14</volume><fpage>1</fpage><lpage>9</lpage><pub-id pub-id-type="doi">10.1186/1472-6963-14-254</pub-id><pub-id pub-id-type="medline">24927847</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ammenwerth</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sp&#x00F6;tl</surname><given-names>HP</given-names> </name></person-group><article-title>The time needed for clinical documentation versus direct patient care</article-title><source>Methods Inf Med</source><year>2009</year><volume>48</volume><issue>1</issue><fpage>84</fpage><lpage>91</lpage><pub-id pub-id-type="doi">10.3414/ME0569</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sinsky</surname><given-names>C</given-names> </name><name name-style="western"><surname>Colligan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Allocation of physician time in ambulatory practice: a time and motion study in 4 specialties</article-title><source>Ann Intern Med</source><year>2016</year><month>12</month><day>6</day><volume>165</volume><issue>11</issue><fpage>753</fpage><lpage>760</lpage><pub-id pub-id-type="doi">10.7326/M16-0961</pub-id><pub-id pub-id-type="medline">27595430</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>Ontario expanding support for children and youth with special needs</article-title><source>Government of Ontario</source><year>2021</year><access-date>2026-04-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://news.ontario.ca/en/release/1000704/ontario-expanding-support-for-children-and-youth-with-special-needs">https://news.ontario.ca/en/release/1000704/ontario-expanding-support-for-children-and-youth-with-special-needs</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Kotyk</surname><given-names>R</given-names> </name></person-group><article-title>Navigating speech therapy in Ontario: wait-list solutions</article-title><source>Voice &#x0026; Speech Therapy Co</source><year>2023</year><access-date>2026-04-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.voiceandspeech.ca/voice-speech-therapy-guide/wait-list">https://www.voiceandspeech.ca/voice-speech-therapy-guide/wait-list</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goffredo</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Bowyer</surname><given-names>P</given-names> </name><name name-style="western"><surname>Reis</surname><given-names>HIS</given-names> </name><name name-style="western"><surname>Humphrey</surname><given-names>J</given-names> </name></person-group><article-title>Pediatric occupational therapists and occupational stress: a scoping review</article-title><source>Occup Ther Health Care</source><year>2024</year><month>07</month><volume>38</volume><issue>3</issue><fpage>827</fpage><lpage>841</lpage><pub-id pub-id-type="doi">10.1080/07380577.2022.2156024</pub-id><pub-id pub-id-type="medline">36524900</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large AI models in health informatics: applications, challenges, and the future</article-title><source>IEEE J Biomed Health Inform</source><year>2023</year><month>12</month><volume>27</volume><issue>12</issue><fpage>6074</fpage><lpage>6087</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2023.3316750</pub-id><pub-id pub-id-type="medline">37738186</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ouyang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>D</given-names> </name></person-group><article-title>ChatCAD: interactive computer-aided diagnosis on medical image using large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 14, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.07257</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Knoll</surname><given-names>T</given-names> </name><name name-style="western"><surname>Moramarco</surname><given-names>F</given-names> </name><name name-style="western"><surname>Korfiatis</surname><given-names>AP</given-names> 
</name><etal/></person-group><article-title>User-driven research of medical note generation software</article-title><source>arXiv</source><comment>Preprint posted online on  May 5, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2205.02549</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Giorgi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Toma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>R</given-names> </name><etal/></person-group><article-title>WangLab at MEDIQA-chat 2023: clinical note generation from doctor-patient conversations using large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 3, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.02220</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Biswas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Talukdar</surname><given-names>W</given-names> </name></person-group><article-title>Intelligent clinical documentation: harnessing generative AI for patient-centric clinical note generation</article-title><source>arXiv</source><comment>Preprint posted online on  May 28, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.18346</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Goyal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rastogi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rajagopal</surname><given-names>SP</given-names> </name><etal/></person-group><article-title>HealAI: a healthcare LLM for effective medical documentation</article-title><year>2024</year><month>03</month><day>4</day><conf-name>Proceedings of the 17th ACM International Conference on Web Search and Data Mining</conf-name><conf-date>Mar 4-8, 2024</conf-date><conf-loc>Merida Mexico</conf-loc><fpage>1167</fpage><lpage>1168</lpage><pub-id pub-id-type="doi">10.1145/3616855.3635739</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ansar</surname><given-names>W</given-names> </name><name name-style="western"><surname>Goswami</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chakrabarti</surname><given-names>A</given-names> </name></person-group><article-title>A survey on transformers in NLP with focus on efficiency</article-title><source>arXiv</source><comment>Preprint posted online on  May 15, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.16893</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Judge</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Krewer</surname><given-names>F</given-names> </name><name name-style="western"><surname>O&#x2019;Donnell</surname><given-names>MJ</given-names> </name><etal/></person-group><article-title>Multimodal artificial intelligence in 
medicine</article-title><source>Kidney360</source><year>2024</year><month>11</month><day>1</day><volume>5</volume><issue>11</issue><fpage>1771</fpage><lpage>1779</lpage><pub-id pub-id-type="doi">10.34067/KID.0000000000000556</pub-id><pub-id pub-id-type="medline">39167446</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yuan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rastogi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Naik</surname><given-names>G</given-names> </name><etal/></person-group><article-title>A continued pretrained LLM approach for automatic medical note generation</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 14, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.09057</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Podder</surname><given-names>V</given-names> </name><name name-style="western"><surname>Lew</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ghassemzadeh</surname><given-names>S</given-names> </name></person-group><article-title>SOAP notes</article-title><source>StatPearls</source><year>2024</year><access-date>2026-05-08</access-date><publisher-name>StatPearls Publishing</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/books/NBK482263/">http://www.ncbi.nlm.nih.gov/books/NBK482263/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>DiMaio</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tuinstra</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wylie-Toal</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tripp</surname><given-names>BP</given-names> </name></person-group><article-title>Custom language model for pediatric occupational therapy documentation [master&#x2019;s thesis]</article-title><year>2024</year><publisher-name>University of Waterloo</publisher-name></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Finley</surname><given-names>G</given-names> </name><name name-style="western"><surname>Edwards</surname><given-names>E</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>A</given-names> </name><etal/></person-group><article-title>An automated medical scribe for documenting clinical encounters</article-title><year>2018</year><conf-name>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Demonstrations</conf-name><conf-date>Jun 2018</conf-date><conf-loc>New Orleans, Louisiana</conf-loc><fpage>11</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.18653/v1/N18-5003</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Krishna</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Khosla</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bigham</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Lipton</surname><given-names>ZC</given-names> </name></person-group><article-title>Generating SOAP notes from doctor-patient conversations using modular summarization techniques</article-title><source>arXiv</source><comment>Preprint posted online on  May 4, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.01795</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Su</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hassanzadeh</surname><given-names>HR</given-names> </name><name name-style="western"><surname>Schaaf</surname><given-names>T</given-names> </name></person-group><article-title>Extract and abstract with BART for clinical notes from doctor-patient conversations</article-title><access-date>2026-05-08</access-date><conf-name>Interspeech 2022</conf-name><conf-date>Sep 18-22, 2022</conf-date><fpage>2488</fpage><lpage>2492</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.isca-archive.org/interspeech_2022/su22b_interspeech.html#">https://www.isca-archive.org/interspeech_2022/su22b_interspeech.html#</ext-link></comment><pub-id pub-id-type="doi">10.21437/Interspeech.2022-10935</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ramprasad</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ferracane</surname><given-names>E</given-names> </name><name name-style="western"><surname>Selvaraj</surname><given-names>SP</given-names> </name></person-group><article-title>Generating more faithful and consistent SOAP notes using attribute-specific parameters</article-title><year>2023</year><access-date>2025-01-15</access-date><conf-name>Proceedings of the 8th Machine Learning for Healthcare Conference</conf-name><conf-date>Aug 11-12, 2023</conf-date><conf-loc>New York, USA</conf-loc><fpage>631</fpage><lpage>649</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v219/ramprasad23a.html">https://proceedings.mlr.press/v219/ramprasad23a.html</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rennard</surname><given-names>V</given-names> </name><name name-style="western"><surname>Shang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hunter</surname><given-names>J</given-names> </name><name name-style="western"><surname>Vazirgiannis</surname><given-names>M</given-names> </name></person-group><article-title>Abstractive meeting summarization: a survey</article-title><source>Trans Assoc Comput Linguist</source><year>2023</year><month>07</month><day>25</day><volume>11</volume><fpage>861</fpage><lpage>884</lpage><pub-id pub-id-type="doi">10.1162/tacl_a_00578</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Enarvi</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Amoia</surname><given-names>M</given-names> </name><name name-style="western"><surname>Del-Agua Teba</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Generating medical reports from patient-doctor conversations using sequence-to-sequence models</article-title><year>2020</year><conf-name>Proceedings of the First Workshop on Natural Language Processing for Medical Conversations</conf-name><conf-date>Jul 2020</conf-date><conf-loc>Online</conf-loc><fpage>22</fpage><lpage>30</lpage><pub-id pub-id-type="doi">10.18653/v1/2020.nlpmc-1.4</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ben Abacha</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yim</surname><given-names>W wai</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>G</given-names> </name><name name-style="western"><surname>Snider</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yetisgen</surname><given-names>M</given-names> </name></person-group><article-title>Overview of the MEDIQA-chat 2023 shared tasks on the summarization &#x0026; generation of doctor-patient conversations</article-title><year>2023</year><conf-name>Proceedings of the 5th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jul 14, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><fpage>503</fpage><lpage>513</lpage><pub-id pub-id-type="doi">10.18653/v1/2023.clinicalnlp-1.52</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>YW</given-names> </name><name name-style="western"><surname>Hirschberg</surname><given-names>J</given-names> </name></person-group><article-title>Exploring robustness in doctor-patient conversation summarization: an analysis of out-of-domain SOAP notes</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 5, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.02826</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ju</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name></person-group><article-title>Exploring the potential of ChatGPT in medical dialogue summarization: a study on consistency with human preferences</article-title><source>BMC Med Inform Decis Mak</source><year>2024</year><month>03</month><day>14</day><access-date>2026-05-08</access-date><volume>24</volume><issue>1</issue><fpage>75</fpage><comment><ext-link ext-link-type="uri" xlink:href="https://api.semanticscholar.org/CorpusID:268387676">https://api.semanticscholar.org/CorpusID:268387676</ext-link></comment><pub-id pub-id-type="doi">10.1186/s12911-024-02481-8</pub-id><pub-id pub-id-type="medline">38486198</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>AI medical scribe</article-title><source>DeepScribe</source><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.deepscribe.ai/">https://www.deepscribe.ai/</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>AI ambient scribe and AI medical dictation</article-title><source>Tali</source><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://tali.ai/">https://tali.ai/</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Dragon medical one&#x2014;#1 clinical documentation companion | Nuance</article-title><source>Nuance Communication</source><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nuance.com/healthcare/dragon-ai-clinical-solutions/dragon-medical-one.html">https://www.nuance.com/healthcare/dragon-ai-clinical-solutions/dragon-medical-one.html</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><article-title>AI-powered medical transcription and note generation</article-title><source>AutoScribe</source><access-date>2025-02-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mutuohealth.com/">https://www.mutuohealth.com/</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Buchem</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Kant</surname><given-names>IMJ</given-names> </name><name name-style="western"><surname>King</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kazmaier</surname><given-names>J</given-names> </name><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Bauer</surname><given-names>MP</given-names> </name></person-group><article-title>Impact of a digital scribe system on clinical documentation time and quality: usability study</article-title><source>JMIR AI</source><year>2024</year><month>09</month><day>23</day><volume>3</volume><issue>1</issue><fpage>e60020</fpage><pub-id pub-id-type="doi">10.2196/60020</pub-id><pub-id pub-id-type="medline">39312397</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Soboczenski</surname><given-names>F</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name></person-group><article-title>Generating (factual?) 
narrative summaries of RCTs: experiments with neural multi-document summarization</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2021</year><volume>2021</volume><fpage>605</fpage><lpage>614</lpage><pub-id pub-id-type="medline">34457176</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Goldberg</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kohane</surname><given-names>I</given-names> </name></person-group><source>The AI Revolution in Medicine: GPT-4 and Beyond</source><year>2023</year><access-date>2026-05-08</access-date><publisher-name>Pearson</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://books.google.ca/books/about/The_AI_Revolution_in_Medicine.html?id=ZtnPEAAAQBAJ&#x0026;redir_esc=y">https://books.google.ca/books/about/The_AI_Revolution_in_Medicine.html?id=ZtnPEAAAQBAJ&#x0026;redir_esc=y</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Du</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Testing and evaluation of generative large language models in electronic health record applications: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2026</year><month>03</month><day>1</day><volume>33</volume><issue>3</issue><fpage>743</fpage><lpage>753</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf233</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koohi Habibi Dehkordi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Perl</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Deek</surname><given-names>FP</given-names> </name><etal/></person-group><article-title>Improving large language models&#x2019; summarization accuracy by adding highlights to discharge notes: comparative evaluation</article-title><source>JMIR Med Inform</source><year>2025</year><month>07</month><day>24</day><volume>13</volume><issue>1</issue><fpage>e66476</fpage><pub-id pub-id-type="doi">10.2196/66476</pub-id><pub-id pub-id-type="medline">40705416</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Van Veen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Uden</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blankemeier</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Adapted large language models can outperform medical experts in clinical text summarization</article-title><source>Nat Med</source><year>2024</year><month>04</month><volume>30</volume><issue>4</issue><fpage>1134</fpage><lpage>1142</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-02855-5</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Bednarczyk</surname><given-names>L</given-names> </name><name name-style="western"><surname>Reichenpfader</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gaudet-Blavignac</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Scientific evidence for clinical text summarization using large language models: scoping review</article-title><source>J Med Internet Res</source><year>2025</year><month>05</month><day>15</day><volume>27</volume><issue>1</issue><fpage>e68998</fpage><pub-id pub-id-type="doi">10.2196/68998</pub-id><pub-id pub-id-type="medline">40371947</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bai</surname><given-names>E</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Assessment and integration of large language models for automated electronic health record documentation in emergency medical services</article-title><source>J Med Syst</source><year>2025</year><month>05</month><day>17</day><volume>49</volume><issue>1</issue><fpage>65</fpage><pub-id pub-id-type="doi">10.1007/s10916-025-02197-w</pub-id><pub-id pub-id-type="medline">40381087</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Wu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Spiliopoulou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name></person-group><article-title>Improving clinical note generation from complex doctor-patient conversation</article-title><source>Advances in Knowledge Discovery and Data Mining (2017)</source><publisher-name>Springer-Verlag</publisher-name><fpage>209</fpage><lpage>221</lpage><pub-id pub-id-type="doi">10.1007/978-981-96-8186-0_17</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Goyal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rastogi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Beinstein</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Thompson</surname><given-names>P</given-names> </name><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Gupta</surname><given-names>D</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>P</given-names> </name></person-group><article-title>SpecialtyScribe: enhancing SOAP note scribing for medical specialties using LLM&#x2019;s</article-title><year>2025</year><access-date>2026-05-08</access-date><conf-name>Proceedings of the Second Workshop on Patient-Oriented Language Processing (CL4Health)</conf-name><conf-date>May 3-4, 2025</conf-date><conf-loc>Albuquerque, NM</conf-loc><fpage>34</fpage><lpage>45</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.cl4health-1">https://aclanthology.org/2025.cl4health-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2025.cl4health-1.4</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kamal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Oates</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>J</given-names> </name></person-group><article-title>Towards scalable SOAP note generation: a weakly supervised multimodal framework</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 12, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2506.10328</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Bag of tricks for training data extraction from language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 9, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.04460</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heider</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Obeid</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Meystre</surname><given-names>SM</given-names> </name></person-group><article-title>A comparative analysis of speed and accuracy for three off-the-shelf de-identification tools</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2020</year><volume>2020</volume><fpage>241</fpage><lpage>250</lpage><pub-id pub-id-type="medline">32477643</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  May 20, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>M</given-names> </name></person-group><article-title>Synthetic data generation with large language models for text classification: potential and limitations</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 11, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.07849</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>J</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>Generating training data with language models: towards zero-shot language understanding</article-title><year>2022</year><access-date>2026-05-08</access-date><conf-name>Advances in Neural Information Processing Systems 35</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><conf-loc>New Orleans, Louisiana, USA</conf-loc><fpage>462</fpage><lpage>477</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://www.proceedings.com/68431.html">http://www.proceedings.com/68431.html</ext-link></comment><pub-id pub-id-type="doi">10.52202/068431-0034</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Meng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Michalski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Abdelzaher</surname><given-names>T</given-names> </name><name name-style="western"><surname>Han</surname><given-names>J</given-names> </name></person-group><article-title>Tuning language models as training data generators for augmentation-enhanced few-shot learning</article-title><year>2023</year><access-date>2026-04-23</access-date><conf-name>Proceedings of the 40th International Conference on Machine Learning</conf-name><conf-date>Jul 23-29, 2023</conf-date><conf-loc>Honolulu, Hawaii, USA</conf-loc><fpage>24457</fpage><lpage>24477</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v202/meng23b.html">https://proceedings.mlr.press/v202/meng23b.html</ext-link></comment></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ducharme</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vincent</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jauvin</surname><given-names>C</given-names> </name></person-group><article-title>A neural probabilistic language model</article-title><source>J Mach Learn 
Res</source><year>2003</year><volume>3</volume><issue>Feb</issue><fpage>1137</fpage><lpage>1155</lpage><pub-id pub-id-type="doi">10.1162/153244303322533223</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ippolito</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Jagielski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tram&#x00E8;r</surname><given-names>F</given-names> </name><name name-style="western"><surname>Carlini</surname><given-names>N</given-names> </name></person-group><article-title>Counterfactual memorization in neural language models</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 24, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2112.12938</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kandpal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>E</given-names> </name><name name-style="western"><surname>Raffel</surname><given-names>C</given-names> </name></person-group><article-title>Deduplicating training data mitigates privacy risks in language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 14, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2202.06539</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gururangan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Marasovi&#x0107;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Swayamdipta</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Don&#x2019;t stop pretraining: adapt language models to domains and tasks</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 23, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2004.10964</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language
models</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 17, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dettmers</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pagnoni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Holtzman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zettlemoyer</surname><given-names>L</given-names> </name></person-group><article-title>QLoRA: efficient finetuning of quantized LLMs</article-title><source>arXiv</source><comment>Preprint posted online on  May 23, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.14314</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>A survey of large language models in medicine: progress, application, and challenge</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 9, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.05112</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Amenyo</surname><given-names>S</given-names> </name></person-group><article-title>Bridging technology and therapy: assessing the quality and analyzing the impact of human editing on AI-generated SOAP notes in pediatric rehabilitation</article-title><year>2025</year><access-date>2026-05-08</access-date><publisher-name>University of Waterloo</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/10012/21567">https://hdl.handle.net/10012/21567</ext-link></comment></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>CY</given-names> </name></person-group><article-title>ROUGE: a package for automatic evaluation of summaries</article-title><year>2004</year><access-date>2026-05-08</access-date><conf-name>Text Summarization Branches Out, Association for Computational Linguistics</conf-name><conf-date>Jul 25-26, 2004</conf-date><conf-loc>Barcelona, Spain</conf-loc><fpage>74</fpage><lpage>81</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W04-1013">https://aclanthology.org/W04-1013</ext-link></comment></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giarelis</surname><given-names>N</given-names> </name><name name-style="western"><surname>Mastrokostas</surname><given-names>C</given-names> </name><name name-style="western"><surname>Karacapilidis</surname><given-names>N</given-names> </name></person-group><article-title>Abstractive vs.
extractive summarization: an experimental review</article-title><source>Appl Sci (Basel)</source><year>2023</year><volume>13</volume><issue>13</issue><fpage>7620</fpage><pub-id pub-id-type="doi">10.3390/app13137620</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> </name></person-group><article-title>A guideline of selecting and reporting intraclass correlation coefficients for reliability research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Cheng</surname><given-names>D</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>F</given-names> </name></person-group><article-title>Adapting large language models via reading comprehension</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.09530</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mitigating catastrophic forgetting in large language models with self-synthesized rehearsal</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 2, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.01244</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>&#x00D6;ncel</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bethge</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ermis</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ravanelli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Subakan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Y&#x0131;ld&#x0131;z</surname><given-names>&#x00C7;</given-names> </name></person-group><article-title>Adaptation odyssey in LLMs: why does additional pretraining sometimes fail to improve?</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 16, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.05581</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Pareja</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nayak</surname><given-names>NS</given-names> </name><name
name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Unveiling the secret recipe: a guide for supervised fine-tuning small LLMs</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.13337</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Meta</collab></person-group><article-title>Fine-tuning Llama guide</article-title><source>GitHub</source><year>2026</year><access-date>2026-01-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/finetuning">https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/finetuning</ext-link></comment></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>DiMaio</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Custom LLM for pediatric occupational therapy</article-title><source>GitHub</source><year>2026</year><access-date>2026-04-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/rmdim/custom-llm-for-pediatric-ot#custom-llm-for-pediatric-occupational-therapy">https://github.com/rmdim/custom-llm-for-pediatric-ot#custom-llm-for-pediatric-occupational-therapy</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Model training implementation and hyperparameters.</p><media xlink:href="ai_v5i1e73274_app1.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Generated subjective-objective-assessment-plan note examples.</p><media xlink:href="ai_v5i1e73274_app2.doc" xlink:title="DOC File, 36 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Scratch note examples.</p><media xlink:href="ai_v5i1e73274_app3.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>ANOVA and intraclass correlation coefficient results for subjective-objective-assessment-plan note quality assessment.</p><media xlink:href="ai_v5i1e73274_app4.docx" xlink:title="DOCX File, 28 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Domain-adaptive fine-tuning example.</p><media xlink:href="ai_v5i1e73274_app5.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>