<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e75262</article-id><article-id pub-id-type="doi">10.2196/75262</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Development and Evaluation of a Retrieval-Augmented Generation Chatbot for Orthopedic and Trauma Surgery Patient Education: Mixed-Methods Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Baur</surname><given-names>David</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ansorg</surname><given-names>J&#x00F6;rg</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Heyde</surname><given-names>Christoph-Eckhard</given-names></name><degrees>Prof Dr Med</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Voelker</surname><given-names>Anna</given-names></name><degrees>MD, PhD</degrees><xref 
ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department for Orthopedics, Trauma and Plastic Surgery, University Hospital Leipzig</institution><addr-line>Liebigstra&#x00DF;e 20</addr-line><addr-line>Leipzig</addr-line><addr-line>Saxony</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Professional Association for Orthopaedics and Trauma Surgery (BVOU)</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Nasution</surname><given-names>Arbi Haza</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Thies</surname><given-names>Bill</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lemke</surname><given-names>Tristan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Anna Voelker, MD, PhD, Department for Orthopedics, Trauma and Plastic Surgery, University Hospital Leipzig, Liebigstra&#x00DF;e 20, Leipzig, Saxony, 04103, Germany, 49 3419723000, 49 3419723009; <email>anna.voelker@medizin.uni-leipzig.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>23</day><month>10</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e75262</elocation-id><history><date date-type="received"><day>31</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>27</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>27</day><month>09</month><year>2025</year></date></history><copyright-statement>&#x00A9; David Baur, J&#x00F6;rg Ansorg, Christoph-Eckhard Heyde, Anna Voelker. 
Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 23.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e75262"/><abstract><sec><title>Background</title><p>Large language models are increasingly applied in health care for documentation, patient education, and clinical decision support. However, their factual reliability can be compromised by hallucinations and a lack of source traceability. Retrieval-augmented generation (RAG) enhances response accuracy by combining generative models with document retrieval mechanisms. While promising in medical contexts, RAG-based systems remain underexplored in orthopedic and trauma surgery patient education, particularly in non-English settings.</p></sec><sec><title>Objective</title><p>This study aimed to develop and evaluate a RAG-based chatbot that provides German-language, evidence-based information on common orthopedic conditions. We assessed the system&#x2019;s performance in terms of response accuracy, contextual precision, and alignment with retrieved sources. 
In addition, we examined user satisfaction, usability, and perceived trustworthiness.</p></sec><sec sec-type="methods"><title>Methods</title><p>The chatbot integrated OpenAI&#x2019;s GPT language model with a Qdrant vector database for semantic search. Its corpus consisted of 899 curated German-language documents, including national orthopedic guidelines and patient education content from the Orthinform platform of the German Society of Orthopedics and Trauma Surgery. After preprocessing, the data were segmented into 18,197 retrievable chunks. Evaluation occurred in two phases: (1) human validation by 30 participants (orthopedic specialists, medical students, and nonmedical users), who rated 12 standardized chatbot responses using a 5-point Likert scale, and (2) automated evaluation of 100 synthetic queries using the Retrieval-Augmented Generation Assessment Scale, measuring answer relevancy, contextual precision, and faithfulness. A permanent disclaimer indicated that the chatbot provides general information only and is not intended for diagnosis or treatment decisions.</p></sec><sec sec-type="results"><title>Results</title><p>Human ratings indicated high perceived quality for accuracy (mean 4.55, SD 0.45), helpfulness (mean 4.61, SD 0.57), ease of use (mean 4.90, SD 0.30), and clarity (mean 4.77, SD 0.43), while trust scored slightly lower (mean 4.23, SD 0.56). Retrieval-Augmented Generation Assessment Scale evaluation confirmed strong technical performance for answer relevancy (mean 0.864, SD 0.223), contextual precision (mean 0.891, SD 0.201), and faithfulness (mean 0.853, SD 0.171). Performance was highest for knee and back-related topics and lower for hip-related queries (eg, gluteal tendinopathy), which showed elevated error rates in differential diagnosis.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The chatbot demonstrated strong performance in delivering orthopedic patient education through an RAG framework. 
Its deployment on the national Orthinform platform has led to more than 9500 real-world user interactions, supporting its relevance and acceptance. Future improvements should focus on expanding domain coverage, enhancing retrieval precision, and integrating multimodal content and advanced RAG techniques to improve robustness and safety in patient-facing apps.</p></sec></abstract><kwd-group><kwd>retrieval-augmented generation</kwd><kwd>RAG</kwd><kwd>orthopedic patient education</kwd><kwd>medical chatbots</kwd><kwd>artificial intelligence in healthcare</kwd><kwd>large language models</kwd><kwd>LLM</kwd><kwd>clinical decision support systems</kwd><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>health information retrieval</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In recent years, machine learning, particularly through large language models (LLMs), has significantly influenced various industries, with profound implications for health care. These advanced models have transitioned from experimental frameworks to practical tools that enhance patient care and medical research. By analyzing complex, large-scale datasets, LLMs enable health care professionals to extract insights related to patient data, disease trends, and treatment efficacy [<xref ref-type="bibr" rid="ref1">1</xref>]. Their expanding role includes apps in disease diagnosis, medical documentation, and patient-provider communication, improving the efficiency and quality of health care services.</p><p>A key advancement in LLMs is their ability to process and interpret extensive contextual information, a crucial feature in health care, where context profoundly affects diagnostic and therapeutic decisions. 
For instance, OpenAI&#x2019;s GPT series has demonstrated improved proficiency in comprehending and generating complex medical dialogs and texts, supporting apps such as automated medical documentation and patient communication [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>In diagnostics, LLMs assist in extracting and summarizing essential information from unstructured medical data, including clinical notes and imaging reports, facilitating timely and accurate condition assessments [<xref ref-type="bibr" rid="ref3">3</xref>]. In addition, these models contribute to medical education by providing interactive learning platforms and access to vast medical literature, enhancing the training and preparedness of medical students and professionals [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Patient engagement is another critical area where LLMs are used. By delivering clear and informative explanations about medical conditions and treatments, these models enhance patient understanding and adherence to treatment plans [<xref ref-type="bibr" rid="ref5">5</xref>]. However, their deployment in health care requires caution due to the potential for generating misleading or incorrect information, known as &#x201C;hallucinations.&#x201D; Such errors can result in misinformed decisions that may compromise patient safety [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>To mitigate these risks, enhancing the reliability of LLM-generated information is essential. 
Techniques such as retrieval-augmented generation (RAG), which integrates the generative capabilities of LLMs with data retrieval from trusted sources, have been developed to improve accuracy and verifiability, thereby preserving the integrity of medical advice [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Providing patient information on symptoms and conditions in orthopedics and trauma surgery presents unique challenges, as treatment success often depends on patients&#x2019; understanding of their condition and adherence to therapeutic measures [<xref ref-type="bibr" rid="ref8">8</xref>]. Traditional methods, including printed materials, online education, and brief personal consultations, have limitations in addressing the comprehensive informational needs of orthopedic patients [<xref ref-type="bibr" rid="ref9">9</xref>]. Digital solutions are gaining traction for their ability to provide accessible, personalized, and on-demand information, with specialized orthopedic apps demonstrating improved patient outcomes through disease education, tailored rehabilitation programs, and real-time feedback [<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Despite growing interest in RAG apps in health care, limited empirical research has evaluated such systems specifically within orthopedic settings involving both medical professionals and patients [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. This study seeks to address this gap by developing and evaluating a RAG-enhanced chatbot designed to provide patient-centered information on orthopedic symptoms and conditions.</p><p>By integrating multiple evaluation approaches, this study contributes to the emerging literature on specialized RAG apps in health care, with implications for improving patient education in orthopedics and trauma surgery. 
The primary objective is to develop and evaluate a RAG chatbot that delivers evidence-based information on orthopedic conditions while comprehensively assessing its performance through feedback from both medical professionals and patients, as well as automated metrics.&#x2003;</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dataset and Preprocessing</title><p>Our dataset comprised official guidelines, specialized medical literature, and curated articles licensed from the Berufsverband f&#x00FC;r Orthop&#x00E4;die und Unfallchirurgie (BVOU-Germany), including texts available on the Orthinform platform [<xref ref-type="bibr" rid="ref12">12</xref>]. These materials encompassed expert-reviewed medical content and patient education resources, all in German.</p><p>To enable contextualized medical information retrieval, we developed an RAG framework using LangChain and OpenAI models. This framework ensures that generated responses are derived from relevant, authoritative sources, supporting an evidence-based and transparent methodology [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. A preprocessing pipeline was implemented to normalize and structure the dataset for efficient retrieval. JSON documents were standardized by extracting and refining metadata, including document names, categories, page numbers, URLs, and timestamps. PDFs were processed using unstructured.partition.pdf, preserving document structure and hierarchical elements. PyMuPDF was used as a fallback when unstructured failed to extract content correctly. To improve text quality, regex-based normalization removed artifacts.</p><p>The text was segmented using RecursiveCharacterTextSplitter, with 1000-character chunks and 200-character overlaps. URLs embedded in the text were preserved, and HTML tags were intentionally retained to maintain the structural integrity of web-based content. 
To enhance traceability, each chunk was assigned a unique identifier generated via MD5 hashing, and the processed data were stored in JSONL format, optimized for retrieval-based workflows.</p></sec><sec id="s2-2"><title>Vectorization and Document Retrieval</title><p>To enable semantic search and efficient retrieval, we implemented Qdrant as a vector database and indexed documents using OpenAI&#x2019;s text-embedding-3-large model. This facilitated high-dimensional vector representations, ensuring precise retrieval of semantically relevant medical texts.</p><p>The Qdrant collection was initialized with a 3072-dimensional vector space, using cosine similarity as the distance metric. Processed text chunks were embedded in batches of 50 to optimize performance. The resulting vectors, along with their associated metadata, were stored in Qdrant and efficiently upserted for retrieval [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s2-3"><title>Chatbot Implementation and Query Processing</title><p>A Streamlit-based chat interface was developed to enable interactive, real-time retrieval of medical information. The system integrates OpenAI for response generation and Qdrant for document retrieval, supporting a conversational RAG framework. The retrieval mechanism is history-aware, dynamically adapting to previous interactions and retrieving up to 5 relevant document chunks per query to ensure contextually rich responses.</p><p>The retrieval pipeline operates with a predefined search prompt, optimizing precision in medical document retrieval to ensure alignment with evidence-based orthopedic and trauma surgery guidelines. Retrieved documents are then passed to a response generation chain, conditioning GPT-4o to structure answers based on medical system instructions and ongoing conversation context. 
While the underlying GPT-4o model supports more than 50 languages, the chatbot&#x2019;s responses are primarily grounded in a curated German-language knowledge base (BVOU guidelines, patient education content). Therefore, while multilingual queries are technically possible, factual accuracy is optimized for German.</p><p>To ensure that all chatbot responses remain strictly grounded in the retrieved medical context and to prevent hallucinations, we implemented a structured system prompt written in German. This prompt explicitly instructs GPT-4o to generate answers exclusively based on the retrieved documents. If no relevant information is found in the retrieved content, the model is required to return a predefined message stating that sufficient information is not available. The system prompt includes the instruction:</p><disp-quote><p>Beantworte die folgende Frage basierend ausschlie&#x00DF;lich auf den Dir vorgelegten Kontext. Wenn du in dem Kontext der Dir vorgelegt wird keine Antwort auf die Frage findest, sag dem User: &#x2018;Dazu liegen mir leider keine Informationen vor.&#x2019; Du darfst keine Empfehlungen geben, keine Diagnosen stellen, keine Inhalte erfinden oder spekulieren. Formuliere sachlich, klar und laienverst&#x00E4;ndlich. Verwende nicht mehr als 250 W&#x00F6;rter, au&#x00DF;er der User fordert ausdr&#x00FC;cklich mehr. Verwende ausschlie&#x00DF;lich den folgenden Kontext: {context}</p></disp-quote><p>The English translation of the above instruction is given below for clarity:</p><disp-quote><p>Answer the following question based strictly on the provided context. If the context does not contain sufficient information to answer the question, respond to the user: &#x2018;Unfortunately, I do not have any information on that.&#x2019; You are not allowed to make recommendations, provide diagnoses, invent content, or speculate. Be objective, clear, and use layperson-friendly language. Do not exceed 250 words unless explicitly requested. 
Use only the following context: {context}.</p></disp-quote><p>The placeholder {context} is dynamically replaced at runtime with the top-k (k=5) most relevant document chunks retrieved from the Qdrant vector database. These context segments are semantically matched to the user query and serve as the exclusive knowledge base from which the response is generated. This prompt configuration ensures that the model does not rely on its pretrained general knowledge or generate information beyond the curated orthopedic corpus, thereby minimizing the risk of hallucinated or unverifiable content.</p><p>To support manual validation and optimization, a debug interface was implemented to inspect retrieved documents, evaluate chunk segmentation, and analyze token consumption. A system prompt was designed to strictly limit response length (250 words, up to 1000 if explicitly requested) and ensure that the chatbot provides only factually grounded answers. The search prompt dynamically reformulates queries based on conversation history to enhance retrieval accuracy.</p><p>The chatbot streams artificial intelligence (AI)-generated responses dynamically, maintains a structured session history, and displays extracted URLs from retrieved documents to ensure source transparency and clinical accountability for users.</p></sec><sec id="s2-4"><title>User Study and Evaluation</title><p>To evaluate the chatbot&#x2019;s clinical reliability and user acceptability, we conducted a structured user study via Google Forms, allowing participants to interact with the system and provide qualitative and quantitative feedback. 
Participants were instructed to input predefined orthopedic queries into the chatbot, covering topics such as herniated discs, hip osteoarthritis, anterior cruciate ligament injuries, and congenital muscular torticollis.</p><p>A total of 12 predefined orthopedic questions were developed by 2 independent board-certified orthopedic specialists to represent common clinical inquiries for each topic. The responses were rated using two key metrics on a 5-point Likert scale: (1) response accuracy measures how precisely the chatbot provided medically valid answers, and (2) helpfulness assesses how effectively the chatbot assisted users in understanding medical conditions and treatments.</p><p>For each question, mean (SD) values were calculated to quantify the consistency and reliability of ratings. Likert-scale data were treated as interval-level data for descriptive statistical analysis, which we considered acceptable due to the scale&#x2019;s symmetric design and widespread use of mean values in user experience (UX) research.</p><p>Beyond response evaluation, the survey included structured sections on empathy, clarity, usability, and response time. Users rated the chatbot&#x2019;s friendliness, ability to recognize intent, trustworthiness, and clarity of medical explanations. In addition, participants assessed navigation, response latency, and overall satisfaction. At the end of the survey, users were given the opportunity to provide open-ended feedback for further improvements. 
The test group included a mix of medical professionals (students, residents, and specialists) and nonmedical users to reflect diverse perspectives, but no subgroup analysis was conducted.</p></sec><sec id="s2-5"><title>Automated Evaluation Using Retrieval-Augmented Generation Assessment Scale</title><p>To supplement the user-based evaluation, we applied the Retrieval-Augmented Generation Assessment Scale (RAGAS) framework, focusing on three key automated performance metrics [<xref ref-type="bibr" rid="ref16">16</xref>]: (1) answer relevancy measures how well the chatbot&#x2019;s response aligns with the user query; (2) context precision assesses the quality and medical relevance of retrieved documents; and (3) faithfulness ensures that responses are grounded solely in retrieved medical contexts, preventing hallucinations.</p><p>We generated 100 synthetic test questions using GPT-4o, ensuring each question was grounded in retrieved medical contexts. Corresponding ground truth answers were synthesized and manually reviewed by 2 board-certified orthopedic and trauma surgeons to ensure clinical accuracy and guideline adherence. Any ambiguous or incorrect responses were corrected to align with evidence-based medicine and current best practices. The 100-question test set was processed through the RAG pipeline, retrieving documents via Qdrant and generating responses using GPT-4o. The RAGAS framework then scored each response against ground truth answers using the 3 core evaluation metrics. Evaluation was conducted in batches, and performance metrics were aggregated into a quantitative evaluation report. Heatmaps and statistical summaries were generated to visualize chatbot performance across key medical retrieval tasks. 
This approach ensured that:</p><list list-type="order"><list-item><p>Retrieved contexts were clinically relevant (context precision).</p></list-item><list-item><p>Responses remained factually grounded in retrieved medical literature (faithfulness).</p></list-item><list-item><p>Answers directly addressed the medical queries posed by users (answer relevancy).</p></list-item></list><p>By combining user-based evaluations with structured, automated performance assessments, we systematically validated the chatbot&#x2019;s ability to retrieve, interpret, and generate medically accurate orthopedic and trauma surgery information.</p></sec><sec id="s2-6"><title>Synthetic Query Generation for RAGAS Evaluation</title><p>For the automated RAGAS evaluation, 100 synthetic test questions and corresponding ground truth answers were generated using GPT-4o, based exclusively on the curated orthopedic knowledge base. This process ensured that all test questions were clinically relevant, context-grounded, and reproducible (<xref ref-type="other" rid="box1">Textboxes 1</xref> and <xref ref-type="other" rid="box2">2</xref>).</p><boxed-text id="box1"><title> Prompt for question generation.</title><p>Based on this medical context:</p><p>[context content]</p><p>Generate a specific and challenging patient question on the topic [topic] ([subtopic]) that:</p><list list-type="order"><list-item><p>Can be answered using the information from this context</p></list-item><list-item><p>Tests understanding of medical concepts</p></list-item><list-item><p>Requires precise information from the text</p></list-item><list-item><p>Could realistically be asked by a patient</p></list-item></list><p>The question should sound natural and specific.</p></boxed-text><boxed-text id="box2"><title> Prompt for ground truth answer generation.</title><p>Based on this medical context:</p><p>[context content]</p><p>Answer this patient question:</p><p>[question]</p><p>The answer should:</p><list 
list-type="order"><list-item><p>Use only information from the provided context</p></list-item><list-item><p>Be precise and complete</p></list-item><list-item><p>Be formulated in patient-friendly language</p></list-item><list-item><p>Be 3&#x2010;5 sentences long</p></list-item></list></boxed-text></sec><sec id="s2-7"><title>Ethical Considerations</title><p>The study was conducted in accordance with the Declaration of Helsinki (World Medical Association; latest version) and applicable national regulations. It used an anonymous, voluntary online questionnaire and did not collect personal identifiers, IP addresses, or contact details. Under the Statutes of the Ethics Committee of the Medical Faculty, Leipzig University (&#x00A7;1(3)), formal review is required when personal data are processed; our protocol analyzed only anonymous responses. Based on this institutional policy and General Data Protection Regulation Recital 26 (anonymous data are not personal data), no formal ethics review was required at our institution. Participation implied consent after reading an online information sheet [<xref ref-type="bibr" rid="ref17">17</xref>, <xref ref-type="bibr" rid="ref18">18</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Preprocessing and Data Preparation</title><p>A total of 899 unique documents were processed, encompassing structured and unstructured medical content from sources such as Orthinform.de, German orthopedic guidelines, and additional curated literature. The dataset was segmented and preprocessed to enable efficient retrieval and response generation within the RAG framework.</p><p>Following the preprocessing pipeline, 18,197 unique text chunks were generated and stored in the Qdrant vector database, optimizing retrieval. These chunks were embedded using OpenAI&#x2019;s text-embedding-3-large model, facilitating high-dimensional semantic search across patient education materials. 
The dataset contained 5,565,958 tokens, with an average chunk size of 973.27 characters. The largest chunks measured 1000 characters, while the smallest segments were 143 characters, ensuring a structured yet comprehensive representation of the source materials.</p></sec><sec id="s3-2"><title>Human Validation</title><p>The user evaluation included 30 participants: 13 licensed physicians (43.3%), 3 resident physicians (10%), 7 medical students (23.3%), and 10 individuals without a medical background (33.3%). Participants rated chatbot responses to 12 predefined orthopedic queries using a 5-point Likert scale. The queries covered four main orthopedic conditions: a herniated disc, hip osteoarthritis (coxarthrosis), anterior cruciate ligament tear, and congenital muscular torticollis. For each condition, 3 specific subquestions addressed pathophysiology, symptoms, and treatment options. A homogeneous test group was not considered essential for the purposes of this study, as the primary focus was on evaluating UX, transparency, and response plausibility from different user perspectives. Including both medical and nonmedical participants allowed for a more realistic appraisal of trust, clarity, and usability in patient-facing contexts.</p></sec><sec id="s3-3"><title>Assessment of Disease-Specific Responses</title><p>The accuracy ratings of chatbot responses ranged from 4.41 to 4.65 (mean), while helpfulness ratings ranged from 4.48 to 4.74 (mean; <xref ref-type="table" rid="table1">Table 1</xref>). The highest accuracy rating (mean 4.65, SD 0.61) was recorded for the question on congenital muscular torticollis. The highest helpfulness rating (mean 4.74, SD 0.51) was observed for the question on hip osteoarthritis symptoms. SD values for accuracy ratings ranged from 0.49 to 0.81 and from 0.49 to 0.68 for helpfulness ratings. 
The greatest variability in accuracy ratings (SD 0.81) was measured for responses regarding herniated disc symptoms.</p><p>Across all responses, 93.2% received a rating of 4 or higher for accuracy and 95.8% for helpfulness. No systematically lower ratings were observed for treatment-related questions compared to pathophysiological or symptom-related queries.</p><p><xref ref-type="table" rid="table1">Table 1</xref> presents data from 30 users, summarizing the average ratings for the accuracy and helpfulness of chatbot responses to specific orthopedic questions. Values are reported as mean (SD). The ratings were collected using a 5-point Likert scale (1=very inaccurate and unhelpful, 5=very accurate and helpful). The data were collected during the evaluation phase in September 2024.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Human validation of disease-specific questions from the chatbot.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Questions</td><td align="left" valign="bottom">How accurate was the information? mean (SD)</td><td align="left" valign="bottom">How helpful were the answers? 
mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Herniated disc</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What exactly is a herniated disc?</td><td align="left" valign="top">4.45 (0.72)</td><td align="left" valign="top">4.58 (0.56)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What are the typical symptoms of a herniated disc?</td><td align="left" valign="top">4.41 (0.81)</td><td align="left" valign="top">4.60 (0.49)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What treatment options are available for a herniated disc and how do they differ?</td><td align="left" valign="top">4.52 (0.68)</td><td align="left" valign="top">4.48 (0.68)</td></tr><tr><td align="left" valign="top">Hip osteoarthritis</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What is coxarthrosis and how does this condition develop?</td><td align="left" valign="top">4.55 (0.57)</td><td align="left" valign="top">4.58 (0.56)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What symptoms are typical of hip osteoarthritis?</td><td align="left" valign="top">4.55 (0.62)</td><td align="left" valign="top">4.74 (0.51)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What treatments are available for coxarthrosis?</td><td align="left" valign="top">4.55 (0.67)</td><td align="left" valign="top">4.65 (0.61)</td></tr><tr><td align="left" valign="top">Anterior cruciate ligament tear</td><td 
align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What is an anterior cruciate ligament tear and how can it occur?</td><td align="left" valign="top">4.54 (0.62)</td><td align="left" valign="top">4.61 (0.56)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What symptoms indicate an anterior cruciate ligament tear?</td><td align="left" valign="top">4.61 (0.62)</td><td align="left" valign="top">4.68 (0.65)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What treatment options are available for an anterior cruciate ligament tear?</td><td align="left" valign="top">4.52 (0.68)</td><td align="left" valign="top">4.52 (0.63)</td></tr><tr><td align="left" valign="top">Congenital muscular torticollis</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What does congenital muscular torticollis mean and how can it be recognized in a baby?</td><td align="left" valign="top">4.65 (0.61)</td><td align="left" valign="top">4.61 (0.56)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What are the causes of congenital muscular torticollis?</td><td align="left" valign="top">4.61 (0.56)</td><td align="left" valign="top">4.65 (0.49)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>What treatment options are available for babies with congenital muscular torticollis?</td><td align="left" valign="top">4.61 (0.49)</td><td align="left" valign="top">4.61 (0.56)</td></tr></tbody></table></table-wrap></sec><sec 
id="s3-4"><title>User Experience</title><p>UX metrics were evaluated across 8 dimensions, as visualized in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Ease of use and navigation intuitiveness received the highest ratings, both with mean values of 4.90 (SD 0.30). Response clarity (mean 4.77, SD 0.43) and understanding of user concerns (mean 4.77, SD 0.43) were also rated positively. Trust in the provided information was rated moderately lower (mean 4.23, SD 0.56). System response time received the lowest rating (mean 3.74, SD 0.73) and exhibited the highest variance. This may partly reflect limitations of the test environment: during the user evaluation, the chatbot was accessed via a local Streamlit app, which lacked the server-side performance optimization of the final production system. In contrast, the deployed version on the Orthinform platform demonstrates faster and more stable response times in real-world use. Overall satisfaction with the chatbot was high (mean 4.71, SD 0.53).</p><p>UX evaluation of the orthopedic chatbot during the user study (September 2024). The radar chart summarizes ratings across 8 dimensions (ease of use, navigation, response clarity, understanding of user concerns, trust, empathy, system response time, and overall satisfaction). Data were collected from 30 participants (licensed physicians, medical students, and nonmedical users) using a 5-point Likert scale (1=very poor and 5=excellent).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>User experience evaluation of the orthopedic chatbot.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e75262_fig01.png"/></fig></sec><sec id="s3-5"><title>Automated Evaluation Using RAGAS</title><p>The 100-question synthetic test set was processed through the chatbot&#x2019;s retrieval pipeline, retrieving up to 5 (k=5) document chunks per query and generating responses using GPT-4o. 
RAGAS evaluation was conducted on 3 core metrics, with results reported as mean (SD) scores to quantify both performance and variability across the test set.</p><sec id="s3-5-1"><title>Answer Relevancy</title><p>The chatbot&#x2019;s ability to generate responses directly aligned with user queries achieved a mean score of 0.864 (SD 0.223). Queries related to back pain and shoulder problems scored the highest, with means of 0.917 and 0.882, respectively, while hip-related topics exhibited slightly lower performance, with a mean of 0.784 (SD 0.338).</p></sec><sec id="s3-5-2"><title>Context Precision</title><p>The retrieval mechanism returned highly relevant documents per query, achieving a mean precision score of 0.891 (SD 0.201). The highest precision was observed for knee-related queries, with a mean of 0.959 (SD 0.078), while hip-related topics showed lower retrieval performance, with a mean of 0.809 (SD 0.362).</p></sec><sec id="s3-5-3"><title>Faithfulness</title><p>Responses remained strongly grounded in retrieved documents, with a mean faithfulness score of 0.853 (SD 0.171). A total of 71% of responses were fully based on the retrieved context (scoring above 0.8), while 19% displayed minor inconsistencies or additional inferred information (scoring between 0.6 and 0.8).</p><p>As shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, the chatbot demonstrates strong performance across all orthopedic domains, with most metrics scoring above 0.8. Notable performance variations were observed across different orthopedic topics. Knee-related queries showed the highest overall context precision score (0.959), indicating exceptionally accurate retrieval of relevant information. 
Back pain queries achieved the highest answer relevancy (0.917), suggesting the system excels at generating targeted responses for this common musculoskeletal complaint.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Retrieval-Augmented Generation Assessment Scale evaluation metrics across major orthopedic domains.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e75262_fig02.png"/></fig><p>Hip-related queries, while still performing adequately, showed comparatively lower scores across all 3 dimensions, particularly in answer relevancy (0.784). This indicates an area for potential improvement in the knowledge base or response generation for hip-related orthopedic conditions.</p><p>RAGAS evaluation metrics across major orthopedic domains (automated evaluation, October 2024). The chart visualizes answer relevancy, context precision, and faithfulness for 5 major orthopedic domains (back pain, hip, knee, shoulder, and foot problems) based on a synthetic test set of 100 GPT-generated queries. Values range from 0 to 1, with a color gradient from red (poor performance) to dark green (excellent performance). Overall mean (SD) values for each metric are displayed at the bottom of the chart.</p></sec></sec><sec id="s3-6"><title>Performance Analysis and Observations</title><p>The subanalysis of the RAGAS evaluation of 100 synthetic test questions across 31 subcategories of major orthopedic conditions reevaluated the chatbot&#x2019;s performance using 3 key metrics: answer relevancy, context precision, and faithfulness. A comprehensive analysis demonstrates that 86% of all queries attained values above 0.8 in both answer relevancy and context precision, while 92% of responses exhibited answer relevancy values above 0.8. 
Among the 6% of queries with answer relevancy values below 0.1, 60% pertained to the hip domain (primarily tendinitis), 25% to foot complaints, and the remaining 15% to various other conditions.</p><p><xref ref-type="fig" rid="figure3">Figure 3</xref> illustrates performance metrics across all 31 orthopedic conditions examined. Several subtopics reached the maximum score (1) in individual metrics. In context precision, this applied to toe arthritis, hip dysplasia, knee instability, shoulder bursitis, and shoulder impingement syndrome. Knee osteoarthritis achieved the maximum value of 1 in faithfulness, combined with high answer relevancy (0.936) and maximum context precision (1). In contrast, hip tendinitis exhibited the lowest answer relevancy value of 0.298 in the entire dataset.</p><p>RAGAS performance metrics across 31 orthopedic subtopics (automated evaluation, October 2024). The chart visualizes answer relevancy, context precision, and faithfulness across 31 specific orthopedic subcategories, grouped by their main domains. Values range from 0 to 1 and are displayed using a color gradient from red (poor performance) to dark green (excellent performance). Overall, mean (SD) values for each metric are provided at the bottom of the chart.</p><p>A detailed analysis of individual questions accounts for the performance deficits in hip tendinitis, as illustrated in <xref ref-type="fig" rid="figure3">Figure 3</xref>. Out of the 3 queries on this topic, 2 (Q19: differential diagnosis of tendinitis; Q20: clinical distinguishing features) received answer relevancy values of 0, while the third query (Q18: therapeutic options) achieved 0.893. This uneven distribution resulted in a low average value of 0.298.</p><p>Overall, 3% of the 100 test questions exhibited total context retrieval failures (score=0). Hip-related topics were disproportionately affected, with complete failures in 15% of answer relevancy assessments and 20% of context precision assessments. 
These failures primarily occurred in complex differentiation tasks, notably Q19 (hip tendinopathy) and Q6 (hip impingement syndrome).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Retrieval-Augmented Generation Assessment Scale performance metrics across 31 orthopedic subtopics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e75262_fig03.png"/></fig></sec><sec id="s3-7"><title>Qualitative Output Analysis and Sample Responses</title><p>To illustrate the output quality of our system, we selected representative user queries along with the corresponding retrieved passages, generated answers, and associated evaluation metrics (answer relevancy, context precision, and faithfulness). These examples reflect typical dialog scenarios and demonstrate both the content quality and the traceability of source references. The selected samples cover a range of anatomical regions (eg, lumbar spine, knee joint, and shoulder) and include questions on diagnostics, conservative management, and surgical decision-making.</p><p><xref ref-type="table" rid="table2">Table 2</xref> presents 2 representative examples of user queries, retrieved context excerpts, and corresponding RAGAS evaluation scores from the automated test set. The user queries and 2 sample answers have been translated from German into English, while other context excerpts remain in the original language to ensure fidelity. All examples are based on the synthetic evaluation conducted in October 2024. 
A complete list of all original queries, answers, and evaluation metrics is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app2">2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Representative examples of user queries, retrieved context excerpts, and Retrieval-Augmented Generation Assessment Scale evaluation scores.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">User query</td><td align="left" valign="bottom">Retrieved context (excerpt)</td><td align="left" valign="bottom">Answer relevancy</td><td align="left" valign="bottom">Context precision</td><td align="left" valign="bottom">Faithfulness</td></tr></thead><tbody><tr><td align="left" valign="top">What conservative treatments are typically used for a lumbar disc herniation, and under what circumstances is surgery unavoidable?</td><td align="left" valign="top">Conservative treatments include physiotherapy, medication, targeted spinal injections, weight reduction, and avoiding prolonged sitting. 
Surgery is considered if symptoms persist for more than 6 months despite therapy or in cases of neurological deficits or loss of bladder/bowel control.</td><td align="left" valign="top">0.974</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">What is the most common location for cartilage damage in the knee joint detected in MRI<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> examinations with a field strength of 3.5 Tesla, and how do the results of these examinations, together with pain localization and intensity, influence the selection of therapy?</td><td align="left" valign="top">In MRI examinations of the knee joint with a field strength of 3.5 Tesla, the most common location for detecting cartilage damage is the medial femoral condyle, followed by the posterior surface of the patella, although the latter is much less common.<break/>The results of these MRI examinations, in conjunction with the patient&#x2019;s clinical examination which includes pain localization and intensity, play a crucial role in influencing the selection of a therapy. While clinical tests rarely provide direct evidence of cartilage damage, the information gathered from them, alongside MRI and X-ray diagnostics, helps determine the appropriate treatment method. The key factors for selecting the therapy are the depth and extent of the cartilage defect that needs to be treated.</td><td align="left" valign="top">0.9702</td><td align="left" valign="top">0.833</td><td align="left" valign="top">1</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>MRI: magnetic resonance imaging. </p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>The evaluation of our RAG chatbot for orthopedic and trauma surgery patient education demonstrated strong performance metrics. 
Human assessments yielded accuracy ratings averaging 4.55 (out of 5) and helpfulness ratings of 4.61 (out of 5) across orthopedic conditions. UX metrics showed high scores for ease of use (4.90) and response clarity (4.77).</p></sec><sec id="s4-2"><title>Comparison With Prior Works</title><p>Compared to previous studies on health care chatbots, our system demonstrates significantly higher performance in terms of accuracy, helpfulness, and user satisfaction. For instance, Nadarzynski et al [<xref ref-type="bibr" rid="ref19">19</xref>] reported acceptance rates of 67% and trust scores averaging 3.4 out of 5 in a general population sample, reflecting persistent concerns about reliability and factual correctness. Similarly, Milne-Ives et al [<xref ref-type="bibr" rid="ref20">20</xref>] observed usability scores typically ranging between 3.5 and 4.2 in various chatbot apps, with considerable heterogeneity depending on system design and target audience. In contrast, our chatbot achieved mean scores of 4.55 for perceived accuracy and 4.61 for helpfulness, supported by strong ratings for usability and clarity. These differences may partly be explained by the academic background of our user cohort, but they also reflect domain-specific design choices and system robustness.</p><p>A notable strength of our study lies in the integration of human and automated evaluation strategies. Beyond structured user feedback, we applied the Retrieval-Augmented Generation Assessment Scale (RAGAS) framework to evaluate the chatbot&#x2019;s contextual precision, answer relevancy, and grounding fidelity. The chatbot achieved strong mean scores across all 3 dimensions (eg, answer relevancy: mean 0.864, SD 0.223), indicating effective retrieval and factual consistency. 
The relatively high SD values reflect expected variability across subdomains, consistent with known topic-specific limitations in medical LLM performance.</p><p>In direct comparison to baseline LLMs without retrieval mechanisms, the advantages of RAG become evident. For example, Deng et al [<xref ref-type="bibr" rid="ref21">21</xref>] showed that ChatGPT-4 provided accurate responses to only 17% of treatment-related orthopedic questions, while 75% were deemed merely comprehensive without being verifiably correct. In contrast, our system achieved 92% of responses above 0.8 in answer relevancy, demonstrating substantial improvements in factual alignment and contextual precision. These results align with findings by Jabal et al [<xref ref-type="bibr" rid="ref22">22</xref>], who observed that retrieval-enhanced LLMs outperformed traditional models in structured data extraction tasks in clinical settings.</p><p>Beyond performance metrics, our system introduces several architectural and contextual innovations that distinguish it from existing chatbot solutions.</p><p>This domain-specific adaptation approach is comparable to methodologies applied in other specialized RAG contexts, such as religious text retrieval [<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Recent studies further support this pattern: a 2-layer RAG architecture improved grounding and answer quality in medical Q&#x0026;A, and RAG variants outperformed GPT-4 in clinical fact-checking, underscoring the value of retrieval for verifiable, source-linked outputs [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>Unlike general-purpose medical chatbots that often rely solely on pretrained LLMs, our framework combines semantic retrieval with domain-specific grounding. All responses are generated based on verified German-language materials, including guideline-compliant content licensed from the BVOU. 
This ensures high clinical relevance and alignment with local standards of care.</p><p>Furthermore, the system is explicitly tailored to German-speaking users and reflects orthopedics-specific terminology, diagnostic reasoning, and patient communication norms. Unlike many tools focused on English-speaking audiences, our chatbot addresses the information needs of a specific linguistic and clinical population. The RAG architecture also promotes transparency by displaying the source URLs and retrieved text segments for each response, enabling traceability and user trust. Finally, our validation approach integrates structured Likert-based user ratings with RAGAS metrics and expert-reviewed reference answers, providing a robust multiperspective evaluation framework.</p><p>In summary, the combination of retrieval-based grounding, language and domain specificity, and transparent validation distinguishes our chatbot from conventional medical AI tools and supports its use in clinically relevant patient education scenarios.</p></sec><sec id="s4-3"><title>Real-World Implementation</title><p>Following the evaluation phase, the chatbot was integrated into the public patient education platform of the Professional Association for Orthopaedics and Trauma Surgery (BVOU) and has been accessible via Orthinform.de since October 2024. This real-world deployment enables patients across Germany to access AI-guided orthopedic information around the clock. The system is clearly labeled as an informational tool, not intended for diagnosis, treatment decisions, or appointment scheduling. It does not support direct communication with clinics.</p><p>As of June 2025, the chatbot has processed a total of 9514 user interactions. The most frequent topics, in descending order of frequency, included back pain, hip joint degeneration, meniscus tear, knee pain, spinal stenosis, shoulder impingement, and disc herniation. 
These usage statistics underscore the chatbot&#x2019;s practical relevance and indicate a high level of user acceptance for AI-assisted patient education in orthopedics.</p></sec><sec id="s4-4"><title>Limitations</title><p>Despite the overall strong performance, several limitations must be noted. First, hip-related queries showed disproportionately high failure rates, with 15% resulting in complete answer relevancy failures (score=0) and 20% exhibiting context precision failures. Questions involving complex differential diagnoses, such as distinguishing gluteal tendinopathy from trochanteric bursitis, proved particularly challenging for the retrieval system. This pattern of domain-specific performance variance, reflected in elevated SD values across metrics, suggests uneven coverage in the knowledge base or suboptimal document chunking in specialized orthopedic subdomains. These findings are consistent with observations by Johnson et al [<xref ref-type="bibr" rid="ref26">26</xref>], who reported similar retrieval limitations in RAG-based diagnostic systems due to gaps in knowledge coverage.</p><p>Second, while the system performed reliably on standard diagnostic and treatment questions, more nuanced or comparative queries, such as those related to treatment efficacy or surgical decision-making, exhibited greater variability in faithfulness scores. The observed mean value of 0.853 (SD 0.171) in faithfulness suggests inconsistent grounding across different query types. Genovese et al [<xref ref-type="bibr" rid="ref27">27</xref>] similarly reported that RAG model accuracy in patient education depends heavily on the depth and quality of the underlying dataset.</p><p>Third, there is a persistent risk of hallucinations. Although the retrieval architecture significantly reduces the likelihood of fabricated responses compared to standard LLMs, it cannot eliminate them entirely. 
The quality of generated outputs is directly tied to the relevance and completeness of the underlying documents. Underrepresented orthopedic subdomains may thus produce lower fidelity outputs. This aligns with concerns raised by Altofer et al [<xref ref-type="bibr" rid="ref28">28</xref>], who described generative AI as a &#x201C;double-edged sword&#x201D; in medicine&#x2014;capable of exceeding expert performance in some domains, yet also prone to generating misleading content.</p><p>Fourth, the current implementation omits several advanced RAG features that may enhance response quality. These include hybrid retrieval methods (combining sparse and dense search), contrastive answer generation, knowledge graph integration, and multimodal capabilities (eg, linking imaging findings with textual explanations). Future work should explore these approaches to strengthen context matching, improve factual precision, and reduce hallucination risks in complex query types.</p><p>Finally, a deliberately implemented safeguard in the form of a persistent disclaimer emphasizes that the chatbot serves solely for general informational purposes and is not intended for self-diagnosis or clinical decision-making. This ethical design feature underlines the system&#x2019;s patient-centered focus and risk awareness. Future versions may benefit from the integration of an autonomous detection mechanism to recognize when a query exceeds the system&#x2019;s informational boundaries and to proactively prompt users to consult a qualified health care professional.</p><p>Another promising enhancement is domain-specific knowledge organization. The observed regional performance variations, with knee-related queries outperforming hip-related ones, suggest that tailored knowledge representation strategies may be needed for different orthopedic subdomains. 
Introducing hierarchical retrieval mechanisms that model anatomical and pathological relationships could improve response accuracy for conditions requiring cross-document integration.</p></sec><sec id="s4-5"><title>Future Work</title><p>To address these limitations, advanced RAG mechanisms could further enhance the system&#x2019;s capabilities. Implementing agentic RAGs, which dynamically reformulate queries based on conversational context, could improve performance on complex differential diagnostic questions. Unlike static top-k retrieval, this approach enables iterative, context-aware information gathering, particularly benefiting challenging hip-related queries identified in our evaluation.</p><p>In future work, domain-specific knowledge organization should also be explored. The observed regional performance variations, with knee-related queries outperforming hip-related ones, suggest that tailored knowledge representation strategies may be needed for different orthopedic subdomains. Introducing hierarchical retrieval mechanisms that model anatomical and pathological relationships could improve response accuracy for conditions requiring cross-document integration.</p></sec><sec id="s4-6"><title>Conclusions</title><p>Our RAG-based chatbot demonstrates strong potential for improving patient access to orthopedic information, with solid evaluation scores and reliable performance. While it already provides valuable support for patient education, continuous adaptation is essential to keep pace with rapid AI advancements and ensure optimal accuracy. Our data show strong overall performance with identifiable outliers where the RAG pipeline is less robust. To ensure consistent quality in patient education and broader medical use cases, performance should be validated across additional datasets and settings with clear, reproducible quality control. 
The impact of RAG will hinge on high-quality, well-managed corpora and reliable retrieval that keeps answers traceable to sources.</p></sec></sec></body><back><ack><p>This project was developed in collaboration with the Professional Association for Orthopaedics and Trauma Surgery (Berufsverband f&#x00FC;r Orthop&#x00E4;die und Unfallchirurgie e.V. - BVOU). We acknowledge their financial support and valuable contributions in shaping the research, providing domain expertise, and facilitating the evaluation process. While BVOU provided financial support and domain expertise, they had no role in the study design, data collection, analysis, or decision to publish. This study was supported by the Open Access Publishing Fund of Leipzig University.</p></ack><notes><sec><title>Data Availability</title><p>The detailed performance metrics of the retrieval-augmented generation system are provided in the <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BVOU</term><def><p>Berufsverband f&#x00FC;r Orthop&#x00E4;die und Unfallchirurgie</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">RAG</term><def><p>retrieval-augmented generation</p></def></def-item><def-item><term id="abb5">RAGAS</term><def><p>Retrieval-Augmented Generation Assessment Scale</p></def></def-item><def-item><term id="abb6">UX</term><def><p>user experience</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Rajkomar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kohane</surname><given-names>I</given-names> </name></person-group><article-title>Machine learning in medicine</article-title><source>N Engl J Med</source><year>2019</year><month>04</month><day>4</day><volume>380</volume><issue>14</issue><fpage>1347</fpage><lpage>1358</lpage><pub-id pub-id-type="doi">10.1056/NEJMra1814259</pub-id><pub-id pub-id-type="medline">30943338</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johri</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>BA</given-names> 
</name><etal/></person-group><article-title>An evaluation framework for clinical use of large language models in patient interaction tasks</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>77</fpage><lpage>86</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03328-5</pub-id><pub-id pub-id-type="medline">39747685</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shoja</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Van de Ridder</surname><given-names>JMM</given-names> </name><name name-style="western"><surname>Rajput</surname><given-names>V</given-names> </name></person-group><article-title>The emerging role of generative artificial intelligence in medical education, research, and practice</article-title><source>Cureus</source><year>2023</year><month>06</month><volume>15</volume><issue>6</issue><fpage>e40883</fpage><pub-id pub-id-type="doi">10.7759/cureus.40883</pub-id><pub-id pub-id-type="medline">37492829</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id 
pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>SV</given-names> </name></person-group><article-title>Accuracy, consistency, and hallucination of large language models when analyzing unstructured clinical notes in electronic medical records</article-title><source>JAMA Netw Open</source><year>2024</year><month>08</month><day>1</day><volume>7</volume><issue>8</issue><fpage>e2425953</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.25953</pub-id><pub-id pub-id-type="medline">39136951</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ning</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Keppo</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for generative artificial intelligence in health care</article-title><source>npj Health Syst</source><year>2025</year><volume>2</volume><issue>1</issue><pub-id pub-id-type="doi">10.1038/s44401-024-00004-1</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Majid</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Plummer</surname><given-names>V</given-names> </name></person-group><article-title>The effectiveness of orthopedic patient education in improving patient outcomes: a systematic review protocol</article-title><source>JBI Database System Rev Implement 
Rep</source><year>2015</year><month>01</month><volume>13</volume><issue>1</issue><fpage>122</fpage><lpage>133</lpage><pub-id pub-id-type="doi">10.11124/jbisrir-2015-1950</pub-id><pub-id pub-id-type="medline">26447013</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x00D3; Doinn</surname><given-names>T</given-names> </name><name name-style="western"><surname>Broderick</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Clarke</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hogan</surname><given-names>N</given-names> </name></person-group><article-title>Readability of patient educational materials in sports medicine</article-title><source>Orthop J Sports Med</source><year>2022</year><month>05</month><volume>10</volume><issue>5</issue><fpage>23259671221092356</fpage><pub-id pub-id-type="doi">10.1177/23259671221092356</pub-id><pub-id pub-id-type="medline">35547607</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Timmers</surname><given-names>T</given-names> </name><name name-style="western"><surname>Janssen</surname><given-names>L</given-names> </name><name name-style="western"><surname>van der Weegen</surname><given-names>W</given-names> </name><etal/></person-group><article-title>The effect of an app for day-to-day postoperative care education on patients with total knee replacement: randomized controlled trial</article-title><source>JMIR Mhealth Uhealth</source><year>2019</year><month>10</month><day>21</day><volume>7</volume><issue>10</issue><fpage>e15323</fpage><pub-id pub-id-type="doi">10.2196/15323</pub-id><pub-id pub-id-type="medline">31638594</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Owoyemi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Abubakar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Owoyemi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Open-source retrieval augmented generation framework for retrieving accurate medication insights from formularies for African healthcare workers</article-title><source>medRxiv</source><comment>Preprint posted online on  Feb 21, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.02.20.25322640</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><source>Orthinform</source><access-date>2025-10-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://orthinform.de/">https://orthinform.de/</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bora</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cuay&#x00E1;huitl</surname><given-names>H</given-names> </name></person-group><article-title>Systematic analysis of retrieval-augmented generation-based LLMs for medical chatbot applications</article-title><source>Mach Learn Knowl Extr</source><year>2024</year><volume>6</volume><issue>4</issue><fpage>2355</fpage><lpage>2374</lpage><pub-id pub-id-type="doi">10.3390/make6040116</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Markey</surname><given-names>N</given-names> </name><name name-style="western"><surname>El-Mansouri</surname><given-names>I</given-names> </name><name name-style="western"><surname>Rensonnet</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>van Langen</surname><given-names>C</given-names> </name><name name-style="western"><surname>Meier</surname><given-names>C</given-names> </name></person-group><article-title>From RAGs to riches: utilizing large language models to write documents for clinical trials</article-title><source>Clin Trials</source><year>2025</year><month>10</month><volume>22</volume><issue>5</issue><fpage>626</fpage><lpage>631</lpage><pub-id pub-id-type="doi">10.1177/17407745251320806</pub-id><pub-id pub-id-type="medline">40013826</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Monir</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>I</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>D</given-names> </name></person-group><article-title>VectorSearch: enhancing document retrieval with semantic embeddings and optimized search</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.17383</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Es</surname><given-names>S</given-names> </name><name name-style="western"><surname>James</surname><given-names>J</given-names> </name><name name-style="western"><surname>Espinosa Anke</surname><given-names>L</given-names> </name><name name-style="western"><surname>Schockaert</surname><given-names>S</given-names> </name></person-group><article-title>RAGAs: automated evaluation of retrieval augmented generation</article-title><conf-name>Proceedings of the 18th Conference of the European Chapter of the Association for 
Computational Linguistics: System Demonstrations</conf-name><conf-date>Mar 17-22, 2024</conf-date><conf-loc>St. Julians, Malta</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.eacl-demo.16</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Statutes for the Ethics Advisory Board and the Ethics Commission for Security-Relevant Research at Leipzig University from 26 March 2025</article-title><source>University Hospital Leipzig</source><access-date>2025-10-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.uni-leipzig.de/fileadmin/ul/Dokumente/2025_Ethikbeirat_Ethics_Advisory_Board_Statutes_KEF_01.pdf">https://www.uni-leipzig.de/fileadmin/ul/Dokumente/2025_Ethikbeirat_Ethics_Advisory_Board_Statutes_KEF_01.pdf</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Regulation (EU) 2016/679 of the European Parliament and of the Council of 27 April 2016 on the protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing Directive 95/46/EC (General Data Protection Regulation)</article-title><source>EUR-Lex</source><access-date>2025-10-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32016R0679&#x0026;utm_source">https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32016R0679&#x0026;utm_source</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nadarzynski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Miles</surname><given-names>O</given-names> </name><name name-style="western"><surname>Cowie</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Ridge</surname><given-names>D</given-names> </name></person-group><article-title>Acceptability of artificial intelligence (AI)-led chatbot services in healthcare: a mixed-methods study</article-title><source>Digit Health</source><year>2019</year><volume>5</volume><fpage>2055207619871808</fpage><pub-id pub-id-type="doi">10.1177/2055207619871808</pub-id><pub-id pub-id-type="medline">31467682</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Milne-Ives</surname><given-names>M</given-names> </name><name name-style="western"><surname>de Cock</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>E</given-names> </name><etal/></person-group><article-title>The effectiveness of artificial intelligence conversational agents in health care: systematic review</article-title><source>J Med Internet Res</source><year>2020</year><month>10</month><day>22</day><volume>22</volume><issue>10</issue><fpage>e20346</fpage><pub-id pub-id-type="doi">10.2196/20346</pub-id><pub-id pub-id-type="medline">33090118</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Oosterhof</surname><given-names>JJ</given-names> </name><etal/></person-group><article-title>ChatGPT is a comprehensive education tool for patients with patellar tendinopathy, but it currently lacks accuracy and readability</article-title><source>Musculoskelet Sci Pract</source><year>2025</year><month>04</month><volume>76</volume><fpage>103275</fpage><pub-id pub-id-type="doi">10.1016/j.msksp.2025.103275</pub-id><pub-id 
pub-id-type="medline">39899928</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jabal</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Warman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Open-weight language models and retrieval-augmented generation for automated structured data extraction from diagnostic reports: assessment of approaches and parameters</article-title><source>Radiol Artif Intell</source><year>2025</year><month>05</month><volume>7</volume><issue>3</issue><fpage>e240551</fpage><pub-id pub-id-type="doi">10.1148/ryai.240551</pub-id><pub-id pub-id-type="medline">40072216</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khalila</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Nasution</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Monika</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Investigating retrieval-augmented generation in quranic studies: a study of 13 open-source large language models</article-title><source>Int J Adv Comput Sci Appl</source><year>2025</year><volume>16</volume><issue>2</issue><pub-id pub-id-type="doi">10.14569/IJACSA.2025.01602134</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Das</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ge</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> 
</name><etal/></person-group><article-title>Two-layer retrieval-augmented generation framework for low-resource medical question answering using reddit data: proof-of-concept study</article-title><source>J Med Internet Res</source><year>2025</year><month>01</month><day>6</day><volume>27</volume><fpage>e66220</fpage><pub-id pub-id-type="doi">10.2196/66220</pub-id><pub-id pub-id-type="medline">39761554</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>An</surname><given-names>R</given-names> </name></person-group><article-title>Use of retrieval-augmented large language model for COVID-19 fact-checking: development and usability study</article-title><source>J Med Internet Res</source><year>2025</year><month>04</month><day>30</day><volume>27</volume><fpage>e66098</fpage><pub-id pub-id-type="doi">10.2196/66098</pub-id><pub-id pub-id-type="medline">40306628</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Bande</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name></person-group><article-title>Retrieval augmented medical diagnosis system</article-title><source>Biol Methods Protoc</source><year>2025</year><volume>10</volume><issue>1</issue><fpage>bpaf017</fpage><pub-id 
pub-id-type="doi">10.1093/biomethods/bpaf017</pub-id><pub-id pub-id-type="medline">40078867</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Genovese</surname><given-names>A</given-names> </name><name name-style="western"><surname>Prabha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Borna</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Artificial intelligence for patient support: assessing retrieval-augmented generation for answering postoperative rhinoplasty questions</article-title><source>Aesthet Surg J</source><year>2025</year><month>06</month><day>16</day><volume>45</volume><issue>7</issue><fpage>735</fpage><lpage>744</lpage><pub-id pub-id-type="doi">10.1093/asj/sjaf038</pub-id><pub-id pub-id-type="medline">40088460</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Altorfer</surname><given-names>FCS</given-names> </name><name name-style="western"><surname>Kelly</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Avrumova</surname><given-names>F</given-names> </name><etal/></person-group><article-title>The double-edged sword of generative AI: surpassing an expert or a deceptive &#x201C;false friend&#x201D;?</article-title><source>Spine J</source><year>2025</year><month>08</month><volume>25</volume><issue>8</issue><fpage>1635</fpage><lpage>1643</lpage><pub-id pub-id-type="doi">10.1016/j.spinee.2025.02.010</pub-id><pub-id pub-id-type="medline">40049450</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Complete performance of retrieval-augmented generation metrics with exact question formulations.</p><media xlink:href="ai_v4i1e75262_app1.xlsx" 
xlink:title="XLSX File, 49 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>German synthetic data set with 100 questions and answers.</p><media xlink:href="ai_v4i1e75262_app2.xlsx" xlink:title="XLSX File, 85 KB"/></supplementary-material></app-group></back></article>