<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e77351</article-id><article-id pub-id-type="doi">10.2196/77351</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>AI-Assisted Medical Documentation in a Multilingual Swiss Health Care System: Proof-of-Concept Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>G&#x0142;adysz</surname><given-names>Mateusz</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fiumedinisi</surname><given-names>Fabrizio</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Burn</surname><given-names>Felice</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rommers</surname><given-names>Nikki</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Giovanoli</surname><given-names>Pietro</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Plock</surname><given-names>Jan Alexander</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Plastic Surgery and Hand Surgery, Kantonsspital Aarau</institution><addr-line>Tellstrasse 25</addr-line><addr-line>Aarau</addr-line><country>Switzerland</country></aff><aff id="aff2"><institution>AI &#x0026; Data Science CoE, Kantonsspital Aarau</institution><addr-line>Aarau</addr-line><country>Switzerland</country></aff><aff id="aff3"><institution>University of Basel</institution><addr-line>Basel</addr-line><addr-line>Basel-City</addr-line><country>Switzerland</country></aff><aff id="aff4"><institution>Department of Plastic Surgery and Hand Surgery, University Hospital of Zurich</institution><addr-line>Zurich</addr-line><country>Switzerland</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bhowmick</surname><given-names>Anirban</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Ankit</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Annan</surname><given-names>Charlotte</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Emekli</surname><given-names>Emre</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zoubi</surname><given-names>Mohammad Al</given-names></name></contrib><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Acharya</surname><given-names>Nirajan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ajibade</surname><given-names>Victoria</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mateusz G&#x0142;adysz, MD, Department of Plastic Surgery and Hand Surgery, Kantonsspital Aarau, Tellstrasse 25, Aarau, 5001, Switzerland; <email>mateusz.gladysz@hotmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>5</day><month>6</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e77351</elocation-id><history><date date-type="received"><day>13</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>31</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>31</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Mateusz G&#x0142;adysz, Fabrizio Fiumedinisi, Felice Burn, Nikki Rommers, Pietro Giovanoli, Jan Alexander Plock. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 5.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e77351"/><abstract><sec><title>Background</title><p>Medical documentation imposes a significant administrative burden on physicians and reduces time for direct patient care. Artificial intelligence (AI)-assisted tools such as automatic speech recognition and large language models (LLMs) promise to reduce this burden, but their performance in multilingual environments has not been explored. Switzerland is highly multilingual, and non-native German-speaking physicians may find documentation particularly challenging.</p></sec><sec><title>Objective</title><p>This study aimed to compare the efficiency and documentation quality of four clinical documentation workflows&#x2014;including both AI-assisted and traditional methods&#x2014;in a Swiss tertiary hospital setting characterized by linguistic diversity.</p></sec><sec sec-type="methods"><title>Methods</title><p>In this proof-of-concept study at a Swiss tertiary hospital (Department of Plastic and Hand Surgery, Cantonal Hospital Aarau), two physicians&#x2014;a native Swiss German speaker and a non-native German speaker&#x2014;documented encounters with simulated patients having common hand disorders. Four documentation workflows were tested: (1) traditional dictation with transcription by a secretary; (2) real-time dictation using speech recognition software for voice to text transcription; (3) postencounter dictation transcribed by an AI (Whisper) and processed by a GPT-based agent; and (4) AI-assisted ambient dictation of entire appointments using audio recording and automatic transcription. 
Documentation efficiency was measured by recorded physician time, and note quality was assessed using a modified Physician Documentation Quality Instrument (PDQI-9) scored by three different LLMs. To protect patient privacy, only synthetic (simulated) patient data were used.</p></sec><sec sec-type="results"><title>Results</title><p>AI-assisted workflows&#x2014;particularly workflow 4 (AI-assisted ambient dictation)&#x2014;produced the shortest physician documentation times per report. In post-hoc comparisons, workflow 4 was significantly faster than solely the speech recognition software workflow (workflow 2) for both physicians (adjusted <italic>P</italic>&#x003C;.001). For the non-native speaker, workflow 4 was not significantly faster than traditional dictation (workflow 1) after adjustment (<italic>P</italic>=.08). LLM evaluators assigned high absolute scores (median PDQI-9 &#x003E;47/50); however, inter-rater reliability was poor (Krippendorff&#x2019;s alpha=&#x2212;.433, 95% CI: &#x2212;0.444 to &#x2212;0.416), indicating systematic disagreement that precludes definitive conclusions about documentation quality from these scores alone.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>AI-assisted documentation demonstrated significant time savings for the native speaker, though the reduction for the non-native speaker did not reach statistical significance in this pilot (<italic>P</italic>=.08). Such tools show potential to alleviate the linguistic challenges faced by non-native speakers, reduce administrative burdens, and enable physicians to spend more time with patients. However, the inconsistency of AI-based quality scoring suggests that LLMs cannot yet reliably replace human evaluation. 
Future studies should evaluate these workflows in real-world clinical implementation, address data privacy and security issues, and include human evaluators to validate the benefits observed in this study.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>natural language processing</kwd><kwd>speech recognition software</kwd><kwd>electronic health records</kwd><kwd>documentation</kwd><kwd>multilingualism</kwd><kwd>efficiency</kwd><kwd>organizational</kwd><kwd>burnout, professional</kwd><kwd>Switzerland</kwd><kwd>generative AI scribes</kwd><kwd>ambient clinical documentation</kwd><kwd>comparative study</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Medical documentation is essential for health care delivery but imposes a substantial administrative burden on physicians. Studies indicate that growing documentation requirements greatly reduce time available for direct patient care, with estimates that physicians spend nearly twice as much time on administrative tasks as on patient interaction [<xref ref-type="bibr" rid="ref1">1</xref>]. This imbalance contributes to physician burnout&#x2014;a rising concern in health care systems worldwide, including in Switzerland [<xref ref-type="bibr" rid="ref2">2</xref>]. Burnout adversely affects physicians&#x2019; well-being and has been linked to compromised patient safety, lower care quality, and reduced health care efficiency.</p><p>Switzerland&#x2019;s linguistic diversity creates unique challenges for health care communication and documentation. The country has four national languages (ie, German, French, Italian, and Romansh) and numerous regional dialects [<xref ref-type="bibr" rid="ref3">3</xref>]. In German-speaking regions, physicians and patients often switch between Swiss German dialects and standard German, complicating both verbal communication and written notes. 
English is also commonly used as a lingua franca in medical settings [<xref ref-type="bibr" rid="ref4">4</xref>]. Non-native speakers comprise a significant portion of the Swiss medical workforce&#x2014;over 40% of physicians practicing in Switzerland received their medical education abroad [<xref ref-type="bibr" rid="ref5">5</xref>]&#x2014;and they face added difficulties documenting and communicating efficiently across multiple languages and dialects. Patients further contribute to this linguistic complexity by speaking various dialects or languages, adding another layer of difficulty to clinical interactions. For example, patients in German-speaking areas might use local Swiss German dialects, standard German, or even Italian or French. The multiplicity of languages and dialects makes accurate, efficient documentation challenging, particularly when using speech recognition technologies not optimized for regional dialects or code-switching. These linguistic factors can increase documentation time, cause misunderstandings, and lead to variability in the quality of medical records.</p><p>Advancements in artificial intelligence (AI)&#x2014;particularly in natural language processing and speech recognition&#x2014;offer promising solutions to ease the documentation burden [<xref ref-type="bibr" rid="ref6">6</xref>]. Recent competitive analyses show that commercial AI scribes can generate notes rapidly (&#x2248;1 min for standardized 15-min encounters) but performance and error profiles vary between products, warranting ongoing evaluation [<xref ref-type="bibr" rid="ref7">7</xref>]. Tools such as large language models (LLMs) and speech-to-text systems can generate high-quality medical notes and reduce documentation time, and some are already in commercial use [<xref ref-type="bibr" rid="ref8">8</xref>]. These AI-assisted tools can transcribe spoken language, interpret clinical context, and produce structured notes that integrate with electronic health record systems. 
However, most studies of such technologies have been conducted in monolingual, predominantly English-speaking environments. A significant gap in the literature remains regarding their effectiveness in multilingual, linguistically complex settings like Switzerland, where the mix of languages and dialects poses additional challenges for AI models trained primarily on standard language data.</p><p>Moreover, implementing AI-assisted documentation tools raises important ethical and practical considerations [<xref ref-type="bibr" rid="ref9">9</xref>]. Data privacy and security are paramount, especially when sensitive patient information is processed via cloud-based AI services [<xref ref-type="bibr" rid="ref10">10</xref>]. Compliance with regulations such as the European Union&#x2019;s General Data Protection Regulation (GDPR) is essential to protect patient confidentiality [<xref ref-type="bibr" rid="ref11">11</xref>]. Additionally, AI models may exhibit biases or performance limitations when handling languages or dialects that were not well represented in their training data [<xref ref-type="bibr" rid="ref12">12</xref>]. These concerns must be addressed to ensure the safe and effective integration of AI technologies into clinical practice.</p><p>This study aims to evaluate the efficiency and quality of AI-assisted medical documentation workflows in a linguistically diverse Swiss tertiary hospital setting. By comparing traditional and AI-assisted documentation methods used by both native and non-native German-speaking physicians, we seek to understand how these technologies perform in a real-world multilingual clinical environment. 
The findings are intended to inform the implementation of AI documentation tools in health care systems facing similar linguistic challenges, ultimately contributing to improved physician efficiency and patient care.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>We conducted a proof-of-concept observational study at the Department of Plastic and Hand Surgery, Cantonal Hospital Aarau AG (Aarau, Switzerland). The primary aim was to compare the efficiency and documentation quality of four medical documentation workflows (traditional vs AI-assisted) in a controlled, simulated clinical setting involving common hand disorders. This study design enabled a standardized comparison of workflows while adhering to ethical requirements for patient privacy and data protection.</p></sec><sec id="s2-2"><title>Participants and Generation of Simulated Encounters</title><p>The study involved two physician participants: one native Swiss German&#x2013;speaking consultant in hand surgery and one non-native German&#x2013;speaking consultant in plastic surgery. Both physicians were experienced clinicians in the department with equivalent professional experience. We conducted twelve simulated patient encounters featuring common hand disorders such as trigger finger, Dupuytren contracture, de Quervain tenosynovitis, and carpal tunnel syndrome. Clinical scenarios and referral letters were initially generated using ChatGPT (GPT-4), then carefully reviewed for clinical accuracy by the first author.</p><p>Simulated patients were portrayed by residents and interns familiar with the medical conditions and daily hand surgery practice. 
Each consultant completed encounters with six simulated cases, ensuring balanced linguistic diversity per consultant as follows:</p><list list-type="bullet"><list-item><p>Standard German speakers (n=2): actors who spoke standard German as their first language.</p></list-item><list-item><p>Non-native German speakers (n=2): actors who spoke German as a second language (first language Italian), representing individuals from Ticino or Italy.</p></list-item><list-item><p>Swiss German dialect speakers (n=2): actors who spoke different Swiss German dialects (eg, from Aargau and Bern).</p></list-item></list><p>Some actors portrayed scenarios for both consultants, although not all did. Specific actor-case assignments were not systematically recorded.</p><p>This diversity in linguistic backgrounds was designed to simulate realistic clinical interactions, ensuring each documentation workflow was tested under representative conditions reflective of Switzerland&#x2019;s multilingual health care environment.</p></sec><sec id="s2-3"><title>Workflows Tested</title><p>We compared four documentation workflows, each with a different level of technological assistance:</p><p>Workflow 1: Traditional Dictation to a Secretary &#x2013; After each simulated patient encounter, the physician dictated the clinical notes, which were transcribed by a medical secretary. The physician later reviewed and corrected the transcribed note in the electronic health record (EHR) system.</p><p>Workflow 2: Speech Recognition Software &#x2013; The physician used Dragon Medical One&#x00AE; speech recognition software to dictate notes directly into the EHR during or immediately after each encounter, correcting errors in real time without a secretary.</p><p>Workflow 3: AI Transcription + GPT Agent &#x2013; After each encounter, the physician dictated the notes, which were transcribed using OpenAI Whisper (Large v3) speech-to-text. 
A custom GPT-based agent then processed the transcript to generate a structured draft of the note. The physician reviewed and edited the AI-generated note before finalizing it in the EHR. Whisper Large v3 was used out of the box, with no fine-tuning, vocabulary customization, or site-specific optimization.</p><p>Workflow 4: Ambient Dictation &#x2013; The entire patient&#x2013;physician encounter was audio-recorded (using a smartphone app). After the appointment, the physician added any examination details by dictating into the recorded audio. The recording was transcribed using the open-source Whisper Large v3 model, and the transcript was then processed by a custom GPT-based system (MediBrief Creator&#x00A9;) to produce a draft note. The physician reviewed the draft for accuracy and added any corrections before integrating the note into the EHR. Whisper Large v3 was used out of the box, with no fine-tuning, vocabulary customization, or site-specific optimization.</p><sec id="s2-3-1"><title>System Instructions for &#x201C;MediBrief Creator&#x201D;</title><sec id="s2-3-1-1"><title>Role</title><p>Create &#x201C;Arztbriefe&#x201D; (medical reports) in German from transcripts.</p><p>Structure:</p><list list-type="bullet"><list-item><p><italic>Hauptdiagnose/Nebendiagnose:</italic> Diagnosis with ICD-10 code</p></list-item><list-item><p><italic>Anamnese:</italic> Subjective history (complete sentences, past tense/subjunctive)</p></list-item><list-item><p><italic>Befund:</italic> Objective findings (complete sentences, no bullet points)</p></list-item><list-item><p><italic>Beurteilung/Procedere:</italic> Summary and plan</p></list-item></list></sec><sec id="s2-3-1-2"><title>Key Guidelines</title><p>Use formal terminology; do not fabricate information; preserve uncertainty if transcript is unclear; &#x201C;as much as necessary, as little as possible.&#x201D;</p></sec></sec></sec><sec id="s2-4"><title>LLM Pipeline (Reproducibility)</title><p>Audio was transcribed with 
Whisper Large v3 and then summarized by a fixed, GPT-4o&#x2013;based agent (&#x201C;MediBrief Creator&#x201D; [<xref ref-type="bibr" rid="ref13">13</xref>]), using a stable prompt stack: (1) section templating (History, Exam, Assessment, Plan); (2) conservative synthesis (&#x201C;preserve uncertainty; do not invent&#x201D;); (3) problem-oriented summary with code suggestions; (4) style normalization; and (5) self-flagging for low-confidence segments. The same prompts/parameters were used for all cases; no mid-study prompt editing occurred (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Key characteristics of each workflow (physician&#x2019;s role, technology used, and notable features).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Attribute</td><td align="left" valign="bottom">Workflow 1: Traditional dictation to secretary</td><td align="left" valign="bottom">Workflow 2: Dragon Medical One software</td><td align="left" valign="bottom">Workflow 3: Dictation with Whisper V3 and GPT Agent</td><td align="left" valign="bottom">Workflow 4: Ambient dictation with full appointment recording and GPT processing</td></tr></thead><tbody><tr><td align="left" valign="top">Description</td><td align="left" valign="top">Physicians dictate notes post-encounter; secretaries transcribe and return for review.</td><td align="left" valign="top">Physicians use speech recognition software to dictate directly into EHR<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, correcting errors in real-time.</td><td align="left" valign="top">Physicians dictate notes; AI<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> transcribes and generates structured notes for review.</td><td align="left" valign="top">Entire patient encounters are recorded; AI transcribes and generates notes; physicians add missing details post-encounter.</td></tr><tr><td 
align="left" valign="top">Physician's role</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Dictate patient notes</p></list-item><list-item><p>Review and correct transcribed notes</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Dictate notes using Dragon Medical One</p></list-item><list-item><p>Edit and correct transcribed text within the EHR</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Dictate patient notes</p></list-item><list-item><p>Review and correct AI-generated notes</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Conduct patient encounters</p></list-item><list-item><p>Provide additional clinical details post-encounter</p></list-item><list-item><p>Review and correct AI-generated notes</p></list-item></list></td></tr><tr><td align="left" valign="top">Technology used</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Digital voice recorder</p></list-item><list-item><p>Human transcription by secretary</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Dragon Medical One software</p></list-item><list-item><p>EHR system</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>OpenAI Whisper V3 (speech-to-text)</p></list-item><list-item><p>Custom GPT agent (note generation)</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Smartphone recording app</p></list-item><list-item><p>OpenAI Whisper V3</p></list-item><list-item><p>Custom GPT agent</p></list-item></list></td></tr><tr><td align="left" valign="top">Key features</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Established practice</p></list-item><list-item><p>Division of labor</p></list-item><list-item><p>Potential delays due to transcription 
time</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Real-time transcription</p></list-item><list-item><p>Immediate self-correction</p></list-item><list-item><p>No secretary involvement</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>AI-assisted transcription and note generation</p></list-item><list-item><p>Structured notes with minimal formatting effort</p></list-item><list-item><p>Inclusion of <italic>ICD-10</italic> codes by AI</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Passive recording of encounters</p></list-item><list-item><p>AI-generated notes from full transcripts</p></list-item><list-item><p>Minimal active documentation during encounters</p></list-item><list-item><p>Inclusion of <italic>ICD-10</italic> codes by AI</p></list-item></list></td></tr><tr><td align="left" valign="top">Rationale</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Serves as control workflow</p></list-item><list-item><p>Reflects standard practice</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Tests efficiency of speech recognition technology without AI language models</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Evaluates combined use of advanced speech-to-text and AI language models</p></list-item><list-item><p>Aims to reduce documentation time and effort</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Tests the concept of an ambient AI scribe</p></list-item><list-item><p>Aims to further reduce active documentation time</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>EHR: electronic health record.</p></fn><fn id="table1fn2"><p><sup>b</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap></sec><sec 
id="s2-5"><title>Efficiency Measurement</title><p>An independent observer measured the time each physician spent on documentation tasks in each workflow using a stopwatch. For Workflow 2, &#x2018;dictation&#x2019; was defined as the time the microphone was active, while &#x2018;correction&#x2019; was defined as time spent on manual input (typing/mouse); we acknowledge these often overlap in practice. Background processing time (eg, AI transcription or note generation by the GPT agent) was not counted, under the assumption that those processes occur automatically without the physician&#x2019;s active involvement. This measurement method isolated the physician&#x2019;s active documentation time for each workflow.</p></sec><sec id="s2-6"><title>Quality Assessment</title><p>Note quality was scored using a modified PDQI-9 [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] approach previously used for AI-scribe evaluation. We rated 10 criteria on 5-point Likert scales (total 0&#x2010;50): Accuracy, Thoroughness, Usefulness, Organization, Comprehensibility, Succinctness, Synthesis, Internal Consistency, Lack of Hallucination, Lack of Bias. No item weighting was used. We reused the published &#x201C;modified PDQI-9&#x201D; approach and list the full criteria here to ensure reproducibility without an appendix.</p></sec><sec id="s2-7"><title>LLM Raters</title><p>Three independent LLMs (Claude 3 [<xref ref-type="bibr" rid="ref15">15</xref>], OpenAI o1-preview [<xref ref-type="bibr" rid="ref16">16</xref>], and GPT-4 [<xref ref-type="bibr" rid="ref17">17</xref>]) rated all notes. Inter-rater reliability was analyzed separately in the Statistical Analysis section using Krippendorff&#x2019;s &#x03B1;.</p></sec><sec id="s2-8"><title>Quality Assessment Prompt Instructions</title><p>The quality assessment used a two-tier approach (<xref ref-type="other" rid="box1">Textbox 1</xref>). 
For the attributes &#x201C;Free from Hallucination&#x201D; and &#x201C;Free from Bias,&#x201D; Workflow 2 was designated as the reference standard because it represents physician-authored, physician-corrected documentation &#x2014; the physician dictated into Dragon Medical One, reviewed the output in real time, and manually corrected errors before finalizing the note. Notes from Workflows 1, 3, and 4 were scored by comparison against the corresponding Workflow 2 note for the same patient. For these two attributes, Workflow 2 notes automatically received a score of 5/5. For the remaining eight PDQI-9 attributes (Accuracy, Thoroughness, Usefulness, Organization, Comprehensibility, Succinctness, Synthesis, Internal Consistency), each note was evaluated independently without an external reference standard. The LLMs&#x2019; assessment of these attributes therefore reflects perceived internal plausibility and linguistic quality rather than verification against clinical ground truth.</p><boxed-text id="box1"><title>Quality assessment instructions.</title><p>&#x201C;In this project acts as a professional medical documentation reviewer, assessing medical notes based on the Modified Physician Documentation Quality Instrument (PDQI-9). It focuses on the following sections: Diagnose, Nebendiagnose, Anamnese, Befund, and Beurteilung. You will not provide medical advice or diagnoses. 
You will strictly focus on assessing the quality of the documentation.</p><p><bold>Assessment Attributes:</bold></p><p>You will rate the notes on the following ten attributes:</p><list list-type="order"><list-item><p>Accurate</p></list-item><list-item><p>Thorough</p></list-item><list-item><p>Useful</p></list-item><list-item><p>Organized</p></list-item><list-item><p>Comprehensible</p></list-item><list-item><p>Succinct</p></list-item><list-item><p>Synthesized</p></list-item><list-item><p>Internally Consistent</p></list-item><list-item><p>Free from Hallucination</p></list-item><list-item><p>Free from Bias</p></list-item></list><p><bold>Scoring System:</bold></p><p>Each attribute will be scored on a scale from 1 to 5 after in-depth analysis of the note.</p><list list-type="endash"><list-item><p>Score 1: Terrible, the note does not fulfill this attribute.</p></list-item><list-item><p>Score 5: Excellent, the note fully meets this attribute with a perfect score.</p></list-item></list><p><bold>Workflow-Specific Scoring:</bold></p><list list-type="endash"><list-item><p>For notes generated in Workflow 2 the attributes **Free from Hallucination** and **Free from Bias** will automatically receive a score of 5/5.</p></list-item><list-item><p>For notes generated in Workflow 1, 3 and Workflow 4, these attributes will be assessed through content comparison to the notes from Workflow 2 for the same patient.</p></list-item></list><p><bold>Tone and Interaction:</bold></p><p>The tone will remain professional, maintaining a formal and respectful interaction.</p><p><bold>Use Case:</bold></p><p>This project is designed for use in a research to assess 48 medical notes created across 4 workflows for 12 test patients.</p><p><bold>Output Format:</bold></p><p>Results will be presented in a table format suitable for statistical analysis, with an additional summary score for each assessed note.&#x201D;</p><p>Note: The above instructions are reproduced verbatim as deployed. 
They contain grammatical errors (eg, missing subject in &#x201C;In this project acts as.&#x201D;; &#x201C;for use in a research&#x201D; instead of &#x201C;for use in research&#x201D;) because they were originally authored as system instructions for an OpenAI Custom GPT platform, where the agent subject is implicit. These errors are acknowledged as a limitation; prompt phrasing can influence LLM behavior, and the grammatical imprecision may have introduced additional variance into the scoring process.</p></boxed-text></sec><sec id="s2-9"><title>Ethical Considerations</title><p>The study used only simulated patient data to avoid privacy concerns and comply with data protection regulations (eg, GDPR) [<xref ref-type="bibr" rid="ref9">9</xref>]. We evaluated the AI tools (including cloud-based transcription and note-generation services) in a simulated environment to ensure feasibility while upholding data privacy standards.</p></sec><sec id="s2-10"><title>Statistical Analysis</title><p>The main outcome of the study is efficacy of the workflows, expressed in the total time the physician needs to invest in the report standardized by the consultation time. This standardized total time consists of dictation and correction time. The efficacy was compared between workflows and native language of the physician. Given the small sample, nonparametric statistical models were used. We use the so-called F1_LD_F1 model described by Brunner &#x0026; Langer [<xref ref-type="bibr" rid="ref18">18</xref>] for the outcome time the physician spends on the report. This test is a nonparametric test with one within-patient factor (ie, workflow), and one between-patient factor (ie, physician language). The test statistic used is the ANOVA-type statistic (ATS), which provides asymptotically valid <italic>P</italic> values without relying on parametric assumptions, and would correspond to a parametric mixed-effects ANOVA. 
We are interested in the interaction effect of workflow and physician language on the time it takes to produce the report, as well as the two main effects of workflow and physician language. This method is implemented in the R package <italic>nparLD2</italic> [<xref ref-type="bibr" rid="ref19">19</xref>]. We report the unadjusted <italic>P</italic> values for the interaction effect and main effects. In case of a significant interaction effect, we continued with pairwise comparisons of the times for the workflows using Dunn&#x2019;s test of multiple comparisons with Bonferroni correction to control for the family-wise error rate, and report the rank-biserial correlation <italic>r</italic> as an effect size for each pairwise comparison. We report the adjusted <italic>P</italic> values of the multiple comparisons. This analysis was repeated for the dictation and correction time separately.</p><p>For the secretary time, which was only available for one workflow, we compared native and non-native speakers using the Wilcoxon rank-sum test.</p><p>Inter-rater reliability among the three LLMs was quantified using Krippendorff&#x2019;s &#x03B1;, calculated on the total score using an interval-scale metric, which accounts for the magnitude of differences between ratings. The coefficient ranges from &#x2212;1 (perfect systematic disagreement) to 1 (perfect agreement), with 0 indicating agreement no better than chance. 
Confidence intervals were derived via bootstrap resampling with 1000 iterations.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Time Efficiency</title><p>We assessed the documentation time outcomes for each workflow in each physician group (native vs non-native German speaker) (<xref ref-type="fig" rid="figure1">Figures 1</xref> and <xref ref-type="fig" rid="figure2">2</xref>, <xref ref-type="table" rid="table2">Table 2</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Physician&#x2019;s median time (in seconds) to finalize the report stratified by workflow and native language. Bars show the interquartile range (25th to 75th percentile), and different colors indicate workflow&#x2013;language combinations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77351_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Physician&#x2019;s median relative time to finalize the report (absolute time needed (sec) divided by the consultation time (sec)) stratified by workflow and the physician&#x2019;s native language. Bars show the interquartile range (25th to 75th percentile), and different colors indicate workflow&#x2013;language combinations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77351_fig02.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Median and interquartile range [25% percentile, 75% percentile] of total documentation time for each workflow by physician. 
The median [Q1,&#x202F;Q3] physician times (seconds) required to finalize a note.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Workflow</td><td align="left" valign="top">Native speaker</td><td align="left" valign="top">Non-native speaker</td></tr></thead><tbody><tr><td align="left" valign="top">Workflow 1: Traditional Dictation</td><td align="left" valign="top">160 [146,&#x202F;207]</td><td align="left" valign="top">310 [285,&#x202F;347]</td></tr><tr><td align="left" valign="top">Workflow 2: Dragon Medical One Software</td><td align="left" valign="top">294 [238,&#x202F;389]</td><td align="left" valign="top">586 [462,&#x202F;632]</td></tr><tr><td align="left" valign="top">Workflow 3: Whisper V3 and GPT Agent</td><td align="left" valign="top">148 [145,&#x202F;197]</td><td align="left" valign="top">287 [256,&#x202F;321]</td></tr><tr><td align="left" valign="top">Workflow 4: AI-Assisted Ambient Dictation</td><td align="left" valign="top">99 [85,&#x202F;114]</td><td align="left" valign="top">84 [77,&#x202F;93]</td></tr></tbody></table></table-wrap><p>Across the two physicians, the nonparametric ANOVA indicated a significant interaction between workflow and physician language (<italic>P</italic>=.001) and a strong main effect of workflow on documentation time (<italic>P</italic>&#x003C;.001), with no significant main effect of physician language (<italic>P</italic>=.44) (<xref ref-type="fig" rid="figure3">Figure 3</xref>). In post-hoc comparisons for the native German-speaking physician, workflow 4 (ambient AI dictation) was significantly faster than workflow 2 (speech recognition software; adjusted <italic>P</italic>&#x003C;.001), while no other pairwise differences were significant. For the non-native speaker, workflow 4 was also faster than workflow 2 (adjusted <italic>P</italic>&#x003C;.001) but did not differ significantly from workflow 1 (traditional dictation; adjusted <italic>P</italic>=.08). 
Workflow 4 could save 62.9% of time (IQR 56.25, 77.88) for a native speaker, compared to a saving of 83.65% (IQR 80.52, 88.65) for the non-native speaker (<xref ref-type="fig" rid="figure4">Figure 4</xref>). No other between-workflow differences reached significance for the non-native speaker.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Physician&#x2019;s median relative dictation time (absolute dictation time (sec) divided by the consultation time (sec)) stratified by workflow and the physician&#x2019;s native language. Bars show the interquartile range (25th to 75th percentile), and different colors indicate workflow&#x2013;language combinations.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77351_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Physician&#x2019;s median relative editing/correction time (absolute editing/correction time (sec) divided by the consultation time (sec)) stratified by workflow and the physician&#x2019;s native language. Bars show the interquartile range (25th to 75th percentile), and different colors indicate workflow&#x2013;language combinations. 
In Workflow 1, physician correction time was minimal but non-zero (median 76s for non-native; 45.5s for native).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77351_fig04.png"/></fig></sec><sec id="s3-2"><title>Practical Time Savings</title><p>Using workflow medians, WF4 reduced active documentation time by &#x2248;3.3 minutes versus WF2 for the native physician (294s vs 99s) and by &#x2248;8.4 minutes for the non-native physician (586s vs 84s).</p></sec><sec id="s3-3"><title>Documentation Quality</title><p>LLM evaluators assigned high absolute PDQI-9 scores across all workflows (overall mean 46.7/50, SD 0.4); however, these scores must be interpreted with caution given the poor inter-rater reliability reported below. Notably, for the eight attributes assessed without an external reference standard&#x2014;including &#x201C;Accuracy&#x201D; and &#x201C;Thoroughness&#x201D;&#x2014;the LLM scores represent perceived plausibility and internal coherence rather than verified factual correctness, as no ground-truth transcript of the simulated encounters was available to the evaluators. Workflow 4 had the highest mean quality score (47.3, SD 1.0), followed by Workflow 1 (46.7, SD 1.2), Workflow 2 (46.3, SD 1.5), and Workflow 3 (46.3, SD 1.5). The largest absolute difference in mean scores between any two workflows was approximately 1 point.</p></sec><sec id="s3-4"><title>Quality Criteria Breakdown</title><p>On each PDQI-9 quality criterion, all workflows performed similarly. For example, <italic>Accuracy</italic> was rated 5.0 (out of 5) for Workflows 1, 2, and 4, and 4.7 for Workflow 3. <italic>Thoroughness</italic> was scored 4.3 for Workflows 1&#x2010;3 and 4.7 for Workflow 4. 
Other criteria such as <italic>Usefulness</italic>, <italic>Organization</italic>, <italic>Synthesis</italic>, <italic>Internal Consistency</italic>, <italic>Lack of Hallucination</italic>, and <italic>Lack of Bias</italic> all had mean scores between 4.7 and 5.0 for every workflow. The <italic>Comprehensibility</italic> score was 4.5 for all workflows, and <italic>Succinctness</italic> was 4.0 for all. Criterion-level scores were similar across workflows, though the reliability of these scores is limited by the systematic disagreement among LLM evaluators (see <xref ref-type="fig" rid="figure6">Figure 6</xref> for a radar chart of criteria scores).</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Decomposition of the relative time of the different workflows into the components dictation time, correction time, and secretary time. Secretary time reflects a nonphysician resource; it represents the time that the secretary needs to type and format the medical note from the dictated transcript; it is visualized only to contextualize task shifting across workflows and is not counted toward the primary physician-time outcome.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77351_fig05.png"/></fig><p>Detailed scores for each criterion are presented in radar plots 1&#x2010;4 (<xref ref-type="fig" rid="figure6">Figure 6</xref>).</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Radar chart of documentation quality criteria (PDQI-9, Physician Documentation Quality Instrument [9-item adaptation]) scores for each workflow.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77351_fig06.png"/></fig></sec><sec id="s3-5"><title>LLM Scoring Details</title><p>The three AI evaluators (LLMs) showed broadly consistent scoring patterns across workflows. 
<italic>Anthropic Claude 3</italic> assigned total quality scores ranging from 45 to 48 out of 50, with Workflow 4 receiving the highest score among those. <italic>OpenAI o1-preview</italic> tended to give slightly higher scores overall&#x2014;Workflows 3 and 4 each attained a perfect total score of 50 with this model. <italic>GPT-4</italic> also scored all workflows highly, though it gave Workflows 3 and 4 slightly lower total scores (44) than Workflows 1 and 2 (46). Despite minor model-to-model variations, all LLMs confirmed that Workflow 4&#x2019;s notes were of quality comparable to or slightly better than those of the other workflows, reinforcing the finding of no major quality degradation with AI assistance. Agreement between the three LLMs on the total score (interval scale) was assessed using Krippendorff&#x2019;s &#x03B1;. The coefficient was &#x2212;0.433 (95% CI &#x2212;0.444 to &#x2212;0.416), indicating systematic disagreement beyond what would be expected by chance. This negative value reflects that the LLMs not only failed to agree, but tended to assign divergent scores to the same cases.</p></sec><sec id="s3-6"><title>Summary of Results</title><p>In summary, <italic>Workflow 4</italic> (ambient AI dictation) had the shortest documentation time overall with LLM-assigned quality scores comparable to the other workflows, though the reliability of these scores is limited. Its efficiency advantage was most pronounced in comparison to Workflow 2 (speech recognition), which had the longest documentation times.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Interpretation of Findings</title><p>Our study confirmed that the ambient AI dictation workflow (Workflow 4) substantially reduced active documentation time for both physicians compared to the other methods, particularly versus the speech recognition workflow (Workflow 2). 
The time savings of Workflow 4 over traditional dictation (Workflow 1) did not reach statistical significance for the non-native physician (<italic>P</italic>=.08), indicating the study was underpowered to confirm this specific hypothesis. LLM evaluators assigned uniformly high PDQI-9 scores across all workflows. However, the negative Krippendorff&#x2019;s &#x03B1; indicates systematic disagreement among the evaluators, meaning these absolute scores cannot be taken as reliable evidence of documentation quality. The quality assessment should be considered exploratory rather than definitive. It should be noted that scores for attributes such as &#x201C;Accuracy&#x201D; and &#x201C;Thoroughness&#x201D; reflect perceived plausibility rather than verified factual correctness, as the LLM evaluators had no access to the clinical events of the encounter (<xref ref-type="fig" rid="figure7">Figure 7</xref>).</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Workflows diagram. Schematic of the four documentation workflows, showing where speech-to-text and AI (artificial intelligence) formatting occur.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77351_fig07.png"/></fig></sec><sec id="s4-2"><title>Clinical Importance of Differences</title><p>There is no established threshold for what constitutes a clinically important difference in PDQI scores in this context. In our series, observed differences were small and&#x2014;critically&#x2014;errors were easy to recognize and correct during brief review. A practical advantage of the AI workflows is that drafts are available immediately after the encounter, enabling same-moment correction, whereas secretarial pathways may introduce delays before physician review.</p></sec><sec id="s4-3"><title>Comparison With Prior Studies</title><p>Our findings on time efficiency align with prior research [<xref ref-type="bibr" rid="ref20">20</xref>]. 
Independent benchmarking likewise reports near&#x2013;real-time note generation yet nonzero error rates and sensitivity to multi-speaker/noisy conditions [<xref ref-type="bibr" rid="ref7">7</xref>]. Quiroz et al [<xref ref-type="bibr" rid="ref21">21</xref>] observed that AI-based digital scribe systems significantly reduced clinicians&#x2019; documentation time, supporting the idea that AI can automate much of the note-taking process. Similarly, Khalessi et al [<xref ref-type="bibr" rid="ref22">22</xref>] reported a 51% decrease in consultation time when using an AI-powered assistant (OSLER), illustrating the streamlining of workflows when speech-to-text is integrated. These parallels in the literature reinforce that the efficiency gains we observed are achievable with AI assistance in documentation.</p><p>Our results regarding note quality are also consistent with other studies [<xref ref-type="bibr" rid="ref20">20</xref>]. Bossen and Pine [<xref ref-type="bibr" rid="ref23">23</xref>] described an AI &#x201C;helper&#x201D; that improved documentation accuracy and consistency when paired with human oversight [<xref ref-type="bibr" rid="ref23">23</xref>]. In our study, physicians similarly reviewed and corrected AI-generated notes (Workflow 4), echoing the importance of human&#x2013;AI collaboration to maintain reliability. Vogel et al [<xref ref-type="bibr" rid="ref24">24</xref>] likewise found that speech recognition technology improved documentation speed and quality, which aligns with the performance of our Workflow 3 (Whisper+GPT) in achieving faster note completion than traditional dictation. Taken together, these studies underscore that integrating advanced speech-to-text and AI generation tools can enhance efficiency without compromising quality, in agreement with our findings.</p><p>Furthermore, the role of AI in multilingual health care environments (like our Swiss setting) remains underexplored. 
Recent work by Kalra and Seitzinger [<xref ref-type="bibr" rid="ref25">25</xref>] suggests that AI can help bridge communication gaps and reduce errors in linguistically diverse settings. Our observation that the non-native speaker benefited most in terms of time savings supports this notion. It indicates that AI documentation tools may be especially valuable where language barriers would otherwise make documentation more cumbersome.</p><p>Overall, our results contribute to the growing body of evidence that AI-assisted documentation can improve efficiency without compromising quality. We also extend this literature by evaluating AI&#x2019;s impact in a multilingual health care environment, demonstrating the potential for AI to alleviate documentation burdens in linguistically diverse settings.</p></sec><sec id="s4-4"><title>Practical Implications</title><p>The results of this study carry several practical implications. First, adopting AI-assisted documentation tools can substantially improve efficiency and reduce physicians&#x2019; administrative workload, which in turn may help lower the risk of physician burnout. By streamlining documentation, physicians can devote more time to direct patient care, potentially improving patient satisfaction and care quality. This benefit is especially pertinent in multilingual health care environments, where linguistic diversity complicates communication and documentation. In such settings, AI tools that proficiently handle multiple languages and dialects can bridge communication gaps and ensure more complete and accurate documentation across language barriers. Because performance differs across vendor implementations, institutions should validate candidate systems locally before rollout [<xref ref-type="bibr" rid="ref7">7</xref>]. 
Our results also illustrate that not all IT-based documentation solutions reduce physicians&#x2019; workload: the speech-recognition workflow (Workflow 2) removes secretary time but <italic>increases</italic> physicians&#x2019; active editing time compared with traditional dictation to a secretary, effectively shifting rather than eliminating documentation work (<xref ref-type="fig" rid="figure5">Figure 5</xref>).</p><p>The design of our study&#x2014;with simulated patients spanning multiple dialects and languages&#x2014;also strengthens its real-world relevance. By mirroring the linguistic diversity that physicians face in Swiss health care, we were able to test AI documentation tools under realistic multilingual conditions. Notably, the AI systems maintained robust performance across this linguistic variability, indicating that such tools could be effective in actual diverse clinical environments. At the same time, our observations highlight that variations in patient language (different dialects or accents) may impact the efficiency and accuracy of speech recognition. Future research should investigate how specific dialects or accents affect AI performance and whether developing speech recognition models tailored to regional dialects can further enhance documentation outcomes.</p><p>Furthermore, the significant time savings observed, especially for the non-native German-speaking physician, suggest that AI-assisted documentation can level the playing field for clinicians working in non-native languages. 
This could lead to increased job satisfaction, reduced stress, and better retention of health care professionals in linguistically diverse settings.</p></sec><sec id="s4-5"><title>Implementation Considerations (Not Measured)</title><p>We did not collect data on costs, infrastructure dependencies, or training/onboarding effort; these domains will materially influence real-world value and should be quantified in implementation studies.</p></sec><sec id="s4-6"><title>Limitations</title><p>Despite our encouraging findings, several limitations must be considered.</p></sec><sec id="s4-7"><title>Use of Synthetic Data</title><p>Our study relied on synthetic (simulated) patient encounters rather than real patient interactions. Simulated patients were residents/interns familiar with the scenarios. This likely produced more structured, interruption-free speech than typical clinical visits and could favor transcription and summarization. This decision was driven by privacy concerns&#x2014;especially for Workflow 4, which involved recording entire appointments in the cloud&#x2014;but it means that the controlled simulation may not capture all the complexity and unpredictability of actual clinical settings. Real encounters often include overlapping speakers, disfluencies, background noise, and non-linear narratives; these conditions may degrade performance relative to what we observed here. As a result, the generalizability of our results to real-world practice needs careful interpretation.</p></sec><sec id="s4-8"><title>Generalizability Beyond Surgical Specialties</title><p>Our setting (Plastic &#x0026; Hand Surgery) features relatively structured encounters. In more narrative-heavy domains (eg, internal medicine, neurology, psychiatry), speech characteristics and note structure differ, and AI advantages&#x2014;and error profiles&#x2014;may shift. 
Results should be extrapolated cautiously to those fields.</p></sec><sec id="s4-9"><title>Speech-to-Text Optimization</title><p>Workflow&#x202F;2 (Dragon Medical One&#x00AE;) and our AI-assisted workflows (Whisper Large v3 in Workflows&#x202F;3&#x2010;4) were <italic>used without training/fine-tuning or site-specific optimization</italic>. This likely <italic>underestimates</italic> the performance achievable with personalization (eg, specialty lexicons, microphone standardization) and domain-specific model tuning.</p></sec><sec id="s4-10"><title>Quality Assessment by AI</title><p>Our method of quality assessment has inherent limitations arising from the absence of independent clinical expert review. In routine practice, the ideal judge of documentation quality is the treating or referring physician with direct patient knowledge. In our study, we lacked such human clinical evaluators and instead used three different LLMs to score note quality. While using multiple AI reviewers helped minimize individual bias, it cannot replicate the nuanced clinical judgment of a physician. For the two attributes scored against Workflow 2 (&#x201C;Free from Hallucination&#x201D; and &#x201C;Free from Bias&#x201D;), the physician-approved note served as a pragmatic reference standard; however, residual speech-recognition errors that escaped physician correction could propagate as false ground truth, potentially penalizing other workflows for deviating from an error rather than introducing one. This design also creates circularity, as the reference standard is itself one of the comparators. For the remaining eight PDQI-9 attributes, the LLMs evaluated each note without any external reference, meaning scores for &#x201C;Accuracy&#x201D; and &#x201C;Thoroughness&#x201D; reflect perceived plausibility rather than verified factual correctness. Furthermore, the lack of variance in accuracy scores suggests a ceiling effect or lack of sensitivity in the LLM scoring tool. 
Future studies should incorporate independent clinical expert review to provide a true ground-truth assessment.</p></sec><sec id="s4-11"><title>Small Sample Size</title><p>The small sample size (two physicians and 12 simulated encounters) limits the diversity of clinical scenarios and clinician behaviors represented. Although a sample of twelve is adequate for pilot studies aimed at estimating variability, our results should be considered exploratory and not generalizable [<xref ref-type="bibr" rid="ref26">26</xref>]. We were not powered for dialect/language subgroup analyses; actor-case assignments were not tracked to enable such stratification post hoc. Additionally, we did not systematically record actor-case assignments regarding dialects, which introduces an uncontrolled confounding variable. Critically, because only one physician represented each language group, the between-group factor (native vs non-native) is confounded with individual physician characteristics such as typing speed, system familiarity, and personal documentation style. Observed differences attributed to language background may therefore reflect individual traits rather than a true language effect, constituting a form of pseudoreplication. Future studies should include multiple physicians per language group to disentangle individual variability from language-related effects.</p></sec><sec id="s4-12"><title>Participant and Software Familiarity</title><p>The participating physicians&#x2019; familiarity with the cases and comfort with technology, as well as personal knowledge of the &#x201C;acting residents,&#x201D; may have positively influenced performance, and individual factors (like typing speed or prior experience with AI tools) could have impacted efficiency outcomes. Thus, our findings may not fully represent the wider physician population or more varied clinical environments. 
The timekeeper and participants were not blinded to workflows, which may introduce Hawthorne effects.</p></sec><sec id="s4-13"><title>Future Directions</title><p>To build on our findings, future studies should address the noted limitations. Real patient encounters&#x2014;conducted with strict ethical oversight and data protection&#x2014;are a priority for validating AI-assisted documentation in practice. In such trials, involving institutional review boards early and collaborating with data privacy officers will help ensure compliance with all regulations. It will also be important to include evaluations by independent clinicians (eg, referring physicians or external experts) to obtain clinically grounded assessments of note quality, rather than relying solely on AI or internal review.</p><p>Further research should also broaden the participant pool and case diversity. Studies that enroll more physicians across different specialties, experience levels, and linguistic backgrounds (and include a wider range of patient scenarios) will enhance the generalizability of results and reveal how AI documentation tools perform across various settings and user groups.</p><p>From a technological standpoint, exploring solutions that minimize data security risks is essential. On-premises AI systems or cloud services with robust compliance certifications could be tested to alleviate privacy concerns associated with cloud processing of sensitive patient data. Developing or fine-tuning AI models to run within a hospital&#x2019;s secure IT infrastructure may allow institutions to harness AI benefits while maintaining full control over patient information.</p><p>Lastly, future investigations should examine the long-term, system-level effects of AI-assisted documentation. Key metrics could include patient throughput (eg, wait times or appointment lengths), physician satisfaction and well-being, and overall health care delivery efficiency. 
By tracking patient outcomes and workflow metrics over time, researchers can determine whether the immediate efficiency gains we observed translate into meaningful improvements in care and provider experience in the long run.</p></sec><sec id="s4-14"><title>Conclusion</title><p>In conclusion, AI-assisted documentation&#x2014;particularly ambient AI dictation&#x2014;demonstrated significant time savings without compromising documentation quality in a multilingual Swiss setting. These preliminary findings support further development and evaluation of AI scribes to alleviate documentation burden and improve equity for non-native speakers. Robust validation with larger samples, real patient encounters, human evaluators, and secure data-processing frameworks is essential before routine deployment.</p></sec></sec></body><back><ack><p>Generative AI systems were used to (1) draft initial simulation referral letters/case vignettes that were reviewed and edited by a clinician and (2) score documentation quality (PDQI&#x2011;9) using three LLMs for this proof&#x2011;of&#x2011;concept. No generative system was used to fabricate or alter study data; all outputs were reviewed by physicians.</p></ack><notes><sec><title>Funding</title><p>We acknowledge financial support by a Booster Research grant of the research council of Cantonal Hospital Aarau (1410.000.240). The funding covered the statistical analysis and will also cover the article processing charges. 
Acknowledgments: We thank the Clinical Trial Unit (CTU) Basel for statistical support and the residents and interns who served as standardized patients.</p></sec><sec><title>Data Availability</title><p>All data generated or analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: MG, JAP</p><p>Methodology: MG, NR</p><p>Investigation: MG, FF, FB</p><p>Formal analysis: NR</p><p>Resources: JAP, PG</p><p>Supervision: JAP, PG</p><p>Writing&#x2014;original draft: MG, JAP</p><p>Writing&#x2014;review &#x0026; editing: FB, FF, JAP, MG, NR, PG</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb3">GDPR</term><def><p>General Data Protection Regulation</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">PDQI-9</term><def><p>Physician Documentation Quality Instrument (9-item adaptation)</p></def></def-item><def-item><term id="abb6">RME</term><def><p>relative marginal effect</p></def></def-item><def-item><term id="abb7">WF1&#x2013;WF4</term><def><p>documentation workflows 1&#x2010;4</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sinsky</surname><given-names>C</given-names> </name><name name-style="western"><surname>Colligan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Allocation of physician time in ambulatory practice: a time and motion study in 4 
Specialties</article-title><source>Ann Intern Med</source><year>2016</year><month>12</month><day>6</day><volume>165</volume><issue>11</issue><fpage>753</fpage><lpage>760</lpage><pub-id pub-id-type="doi">10.7326/M16-0961</pub-id><pub-id pub-id-type="medline">27595430</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>West</surname><given-names>CP</given-names> </name><name name-style="western"><surname>Dyrbye</surname><given-names>LN</given-names> </name><name name-style="western"><surname>Shanafelt</surname><given-names>TD</given-names> </name></person-group><article-title>Physician burnout: contributors, consequences and solutions</article-title><source>J Intern Med</source><year>2018</year><month>06</month><volume>283</volume><issue>6</issue><fpage>516</fpage><lpage>529</lpage><pub-id pub-id-type="doi">10.1111/joim.12752</pub-id><pub-id pub-id-type="medline">29505159</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bischoff</surname><given-names>A</given-names> </name></person-group><article-title>Measuring quality and patient satisfaction in healthcare communication with foreign-language speakers</article-title><source>LANS-TTS</source><year>2021</year><volume>5</volume><pub-id pub-id-type="doi">10.52034/lanstts.v5i.159</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Durham</surname><given-names>M</given-names> </name></person-group><article-title>Language choice on a Swiss mailing list</article-title><source>J Comput Mediat Commun</source><year>2006</year><volume>9</volume><issue>1</issue><fpage>0</fpage><lpage>0</lpage><pub-id 
pub-id-type="doi">10.1111/j.1083-6101.2003.tb00359.x</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kraft</surname><given-names>E</given-names> </name><name name-style="western"><surname>Hostettler</surname><given-names>S</given-names> </name></person-group><article-title>FMH-&#x00C4;rztestatistik 2023 &#x2013; 40% ausl&#x00E4;ndische &#x00C4;rztinnen und &#x00C4;rzte</article-title><source>Schweizerische &#x00C4;rztezeitung</source><year>2024</year><access-date>2026-05-20</access-date><volume>105</volume><issue>12</issue><fpage>32</fpage><lpage>36</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.fmh.ch/files/pdf30/1377245206-de-fmh---aerztestatistik_0001-1.pdf">https://www.fmh.ch/files/pdf30/1377245206-de-fmh---aerztestatistik_0001-1.pdf</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Rivera</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Calvert</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Denniston</surname><given-names>AK</given-names> </name><collab>SPIRIT-AI and CONSORT-AI Working Group</collab></person-group><article-title>Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI Extension</article-title><source>BMJ</source><year>2020</year><month>09</month><day>9</day><volume>370</volume><fpage>m3164</fpage><pub-id pub-id-type="doi">10.1136/bmj.m3164</pub-id><pub-id pub-id-type="medline">32909959</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ha</surname><given-names>E</given-names> </name><name name-style="western"><surname>Choon-Kon-Yune</surname><given-names>I</given-names> </name><name name-style="western"><surname>Murray</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Evaluating the usability, technical performance, and accuracy of artificial intelligence scribes for primary care: competitive analysis</article-title><source>JMIR Hum Factors</source><year>2025</year><month>07</month><day>23</day><volume>12</volume><fpage>e71434</fpage><pub-id pub-id-type="doi">10.2196/71434</pub-id><pub-id pub-id-type="medline">40700466</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tierney</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Gayre</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hoberman</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Ambient artificial intelligence scribes to alleviate the burden of clinical documentation</article-title><source>NEJM Catalyst</source><year>2024</year><month>02</month><day>21</day><volume>5</volume><issue>3</issue><pub-id pub-id-type="doi">10.1056/CAT.23.0404</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Biro</surname><given-names>J</given-names> </name><name name-style="western"><surname>Handley</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Cobb</surname><given-names>NK</given-names> </name><etal/></person-group><article-title>Accuracy and safety of AI-enabled scribe technology: instrument validation study</article-title><source>J Med Internet 
Res</source><year>2025</year><month>01</month><day>27</day><volume>27</volume><fpage>e64993</fpage><pub-id pub-id-type="doi">10.2196/64993</pub-id><pub-id pub-id-type="medline">39869899</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meszaros</surname><given-names>J</given-names> </name><name name-style="western"><surname>Minari</surname><given-names>J</given-names> </name><name name-style="western"><surname>Huys</surname><given-names>I</given-names> </name></person-group><article-title>The future regulation of artificial intelligence systems in healthcare services and medical research in the European Union</article-title><source>Front Genet</source><year>2022</year><volume>13</volume><fpage>927721</fpage><pub-id pub-id-type="doi">10.3389/fgene.2022.927721</pub-id><pub-id pub-id-type="medline">36267404</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Voigt</surname><given-names>P</given-names> </name><name name-style="western"><surname>von dem Bussche</surname><given-names>A</given-names> </name></person-group><article-title>The EU General Data Protection Regulation (GDPR)</article-title><year>2017</year><access-date>2026-06-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://gdpr-info.eu/">https://gdpr-info.eu/</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehrabi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Morstatter</surname><given-names>F</given-names> </name><name name-style="western"><surname>Saxena</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Lerman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Galstyan</surname><given-names>A</given-names> </name></person-group><article-title>A survey on bias and fairness in machine learning</article-title><source>ACM Comput Surv</source><year>2022</year><month>07</month><day>31</day><volume>54</volume><issue>6</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3457607</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>G&#x0142;adysz</surname><given-names>M</given-names> </name></person-group><article-title>MediBrief creator</article-title><source>OpenAI Custom GPT</source><year>2024</year><access-date>2026-06-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://chatgpt.com/g/g-ftj0oDFFy-medibrief-creator">https://chatgpt.com/g/g-ftj0oDFFy-medibrief-creator</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stetson</surname><given-names>PD</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wrenn</surname><given-names>JO</given-names> </name><name name-style="western"><surname>Siegler</surname><given-names>EL</given-names> </name></person-group><article-title>Assessing electronic note quality using the Physician Documentation Quality Instrument (PDQI-9)</article-title><source>Appl Clin Inform</source><year>2012</year><volume>3</volume><issue>2</issue><fpage>164</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.4338/aci-2011-11-ra-0070</pub-id><pub-id pub-id-type="medline">22577483</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation 
citation-type="web"><article-title>Claude 3 Opus</article-title><source>Anthropic</source><year>2024</year><access-date>2024-08-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/news/claude-3-family">https://www.anthropic.com/news/claude-3-family</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>o1-preview</article-title><source>OpenAI</source><year>2024</year><access-date>2024-08-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/introducing-openai-o1-preview/">https://openai.com/index/introducing-openai-o1-preview/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>GPT-4o</article-title><source>OpenAI</source><year>2024</year><access-date>2024-08-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o/">https://openai.com/index/hello-gpt-4o/</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brunner</surname><given-names>E</given-names> </name><name name-style="western"><surname>Langer</surname><given-names>F</given-names> </name></person-group><article-title>Nonparametric analysis of ordered categorical data in designs with longitudinal observations and small sample sizes</article-title><source>Biometrical Journal</source><year>2000</year><month>10</month><volume>42</volume><issue>6</issue><fpage>663</fpage><lpage>675</lpage><pub-id pub-id-type="doi">10.1002/1521-4036(200010)42:6&#x003C;663::AID-BIMJ663&#x003E;3.0.CO;2-7</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noguchi</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Gel</surname><given-names>YR</given-names> </name><name name-style="western"><surname>Brunner</surname><given-names>E</given-names> </name><name name-style="western"><surname>Konietschke</surname><given-names>F</given-names> </name></person-group><article-title>NparLD: an R software package for the nonparametric analysis of longitudinal data in factorial experiments</article-title><source>J Stat Softw</source><year>2012</year><month>09</month><day>18</day><volume>50</volume><issue>12</issue><fpage>1</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.18637/jss.v050.i12</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Buchem</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Kant</surname><given-names>IMJ</given-names> </name><name name-style="western"><surname>King</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kazmaier</surname><given-names>J</given-names> </name><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Bauer</surname><given-names>MP</given-names> </name></person-group><article-title>Impact of a digital scribe system on clinical documentation time and quality: usability study</article-title><source>JMIR AI</source><year>2024</year><month>09</month><day>23</day><volume>3</volume><fpage>e60020</fpage><pub-id pub-id-type="doi">10.2196/60020</pub-id><pub-id pub-id-type="medline">39312397</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Quiroz</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Laranjo</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Kocaballi</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Berkovsky</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rezazadegan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Coiera</surname><given-names>E</given-names> </name></person-group><article-title>Challenges of developing a digital scribe to reduce clinical documentation burden</article-title><source>NPJ Digit Med</source><year>2019</year><volume>2</volume><issue>1</issue><fpage>114</fpage><pub-id pub-id-type="doi">10.1038/s41746-019-0190-1</pub-id><pub-id pub-id-type="medline">31799422</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khalessi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Nuredini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kaur</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pimenta</surname><given-names>D</given-names> </name></person-group><article-title>2260&#x2005;AI-assisted electronic health record use for clinical consultations amongst unfamiliar users: a feasibility study</article-title><source>Emerg Med J</source><year>2023</year><month>11</month><volume>40</volume><issue>887</issue><pub-id pub-id-type="doi">10.1136/emj-2023-RCEM.45</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bossen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pine</surname><given-names>KH</given-names> </name></person-group><article-title>Batman and Robin in healthcare knowledge work: human-AI collaboration by clinical documentation integrity specialists</article-title><source>ACM Trans 
Comput-Hum Interact</source><year>2023</year><month>04</month><day>30</day><volume>30</volume><issue>2</issue><fpage>1</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1145/3569892</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vogel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaisers</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wassmuth</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mayatepek</surname><given-names>E</given-names> </name></person-group><article-title>Analysis of documentation speed using web-based medical speech recognition technology: randomized controlled trial</article-title><source>J Med Internet Res</source><year>2015</year><month>11</month><day>3</day><volume>17</volume><issue>11</issue><fpage>e247</fpage><pub-id pub-id-type="doi">10.2196/jmir.5072</pub-id><pub-id pub-id-type="medline">26531850</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kalra</surname><given-names>J</given-names> </name><name name-style="western"><surname>Seitzinger</surname><given-names>P</given-names> </name></person-group><source>Expanding Our Grasp: Artificial Intelligence as the Next Leap Forward in Healthcare Quality</source><year>2023</year><publisher-name>Healthcare and Medical Devices</publisher-name><pub-id pub-id-type="doi">10.54941/ahfe1003467</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moore</surname><given-names>CG</given-names> </name><name name-style="western"><surname>Carter</surname><given-names>RE</given-names> </name><name 
name-style="western"><surname>Nietert</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Stewart</surname><given-names>PW</given-names> </name></person-group><article-title>Recommendations for planning pilot studies in clinical and translational research</article-title><source>Clin Transl Sci</source><year>2011</year><month>10</month><volume>4</volume><issue>5</issue><fpage>332</fpage><lpage>337</lpage><pub-id pub-id-type="doi">10.1111/j.1752-8062.2011.00347.x</pub-id><pub-id pub-id-type="medline">22029804</pub-id></nlm-citation></ref></ref-list></back></article>