<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e78436</article-id><article-id pub-id-type="doi">10.2196/78436</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating the Reliability and Accuracy of an AI-Powered Search Engine in Providing Responses on Dietary Supplements: Quantitative and Qualitative Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Liu</surname><given-names>Mingxin</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Okuhara</surname><given-names>Tsuyoshi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shirabe</surname><given-names>Ritsuko</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nishiie</surname><given-names>Yuriko</given-names></name><degrees>MPH</degrees><xref 
ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Yinghan</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Okada</surname><given-names>Hiroko</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kiuchi</surname><given-names>Takahiro</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Health Communication, Graduate School of Medicine, The University of Tokyo</institution><addr-line>Bunkyo, Hongo 7-3-1</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Health Communication, School of Public Health, Graduate School of Medicine, The University of Tokyo</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff3"><institution>University Hospital Medical Information Network (UMIN) Center, University of Tokyo Hospital</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff4"><institution>Graduate School of Human Sciences, Waseda University</institution><addr-line>Tokorozawa</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Wang</surname><given-names>Yanshan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gabriels</surname><given-names>Gary</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Dol</surname><given-names>Justine</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Shalaby</surname><given-names>Mohammed 
Nader</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Grace-Farfaglia</surname><given-names>Patricia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mingxin Liu, MA, Department of Health Communication, Graduate School of Medicine, The University of Tokyo, Bunkyo, Hongo 7-3-1, Tokyo, 113-8655, Japan, 81 03-5800-6549; <email>liumingxin98@akane.waseda.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>29</day><month>10</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e78436</elocation-id><history><date date-type="received"><day>02</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>19</day><month>09</month><year>2025</year></date><date date-type="accepted"><day>01</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Mingxin Liu, Tsuyoshi Okuhara, Ritsuko Shirabe, Yuriko Nishiie, Yinghan Xu, Hiroko Okada, Takahiro Kiuchi. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 29.10.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e78436"/><abstract><sec><title>Background</title><p>The widespread adoption of artificial intelligence (AI)&#x2013;powered search engines has transformed how people access health information. Microsoft Copilot, formerly Bing Chat, offers real-time web-sourced responses to user queries, raising concerns about the reliability of its health content. This is particularly critical in the domain of dietary supplements, where scientific consensus is limited and online misinformation is prevalent. Despite the popularity of supplements in Japan, little is known about the accuracy of AI-generated advice on their effectiveness for common diseases.</p></sec><sec><title>Objective</title><p>We aimed to evaluate the reliability and accuracy of Microsoft Copilot, an AI search engine, in responding to health-related queries about dietary supplements. Our findings can help consumers use large language models more safely and effectively when seeking information on dietary supplements and support developers in improving large language models&#x2019; performance in this field.</p></sec><sec sec-type="methods"><title>Methods</title><p>We simulated typical consumer behavior by posing 180 questions (6 per supplement &#x00D7; 30 supplements) to Copilot&#x2019;s 3 response modes (creative, balanced, and precise) in Japanese. These questions addressed the effectiveness of supplements in treating 6 common conditions (cancer, diabetes, obesity, constipation, joint pain, and hypertension). 
We classified the AI search engine&#x2019;s answers as &#x201C;effective,&#x201D; &#x201C;uncertain,&#x201D; or &#x201C;ineffective&#x201D; and evaluated them for accuracy against evidence-based assessments conducted by licensed physicians. We conducted a qualitative content analysis of the response texts and systematically examined the types of sources cited in all responses.</p></sec><sec sec-type="results"><title>Results</title><p>The proportion of Copilot responses claiming supplement effectiveness was 29.4% (53/180), 47.8% (86/180), and 45% (81/180) for the creative, balanced, and precise modes, respectively, whereas overall accuracy of the responses was low across all modes: 36.1% (65/180), 31.7% (57/180), and 31.7% (57/180) for creative, balanced, and precise, respectively. No significant difference was observed among the 3 modes (<italic>P</italic>=.59). Notably, 72.7% (2240/3081) of the citations came from unverified sources such as blogs, sales websites, and social media. Of the 540 responses analyzed, 54 (10%) contained at least 1 citation in which the cited source did not include or support the claim made by Copilot, indicating hallucinated content. Only 48.5% (262/540) of the responses included a recommendation to consult health care professionals. Among disease categories, the highest accuracy was found for cancer-related questions, likely due to lower misinformation prevalence.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This is the first study to assess Copilot&#x2019;s performance on dietary supplement information. Despite its authoritative appearance, Copilot frequently cited noncredible sources and provided ambiguous or inaccurate information. Its tendency to avoid definitive stances and align with perceived user expectations poses potential risks for health misinformation. 
These findings highlight the need for integrating health communication principles&#x2014;such as transparency, audience empowerment, and informed choice&#x2014;into the development and regulation of AI search engines to ensure safe public use.</p></sec></abstract><kwd-group><kwd>artificial intelligence search engine</kwd><kwd>AI search engine</kwd><kwd>Copilot</kwd><kwd>dietary supplements</kwd><kwd>health communication</kwd><kwd>health education</kwd><kwd>large language model</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>The rise of digital technologies has dramatically transformed how people access and evaluate health information [<xref ref-type="bibr" rid="ref1">1</xref>]. From a health communication perspective, tools such as artificial intelligence (AI)&#x2013;powered chatbots play an increasingly important role in shaping how individuals understand and make decisions regarding their health. As these tools become more integrated into everyday search behaviors, examining the accuracy and reliability of the health-related content they provide is crucial [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>One prominent example of such a tool is Microsoft&#x2019;s AI-powered conversational agent, originally launched as Bing Chat on February 7, 2023 [<xref ref-type="bibr" rid="ref4">4</xref>]. Later rebranded as Copilot in late 2023, this tool is now integrated across Microsoft platforms, including Bing, Edge, and Windows [<xref ref-type="bibr" rid="ref5">5</xref>]. Unlike traditional search engines that return a list of hyperlinks, Copilot (formerly Bing Chat) is designed to generate conversational responses by synthesizing information using GPT-4. 
It can retrieve information from the web in real time, providing users with direct answers that often include reference links [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Copilot differs from other large language models (LLMs) in several ways. First, unlike other LLMs with a cutoff date [<xref ref-type="bibr" rid="ref8">8</xref>], it can perform real-time web searches, increasing its risk of incorporating inaccurate or misleading online content [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Second, it is embedded directly into a widely used search engine, exposing a much larger and more general user base to AI-generated content. On the basis of Bing&#x2019;s scale, the health communication implications of misinformation are substantial [<xref ref-type="bibr" rid="ref11">11</xref>]. Third, Copilot provides reference links within its responses that users may perceive as credible and trustworthy, thereby potentially reinforcing inaccuracies [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Although Copilot has tens of millions of active users worldwide [<xref ref-type="bibr" rid="ref13">13</xref>], there is limited research on its reliability in the field of nutrition and dietary planning. One study assessing the accuracy of LLMs in generating kidney-friendly diet plans found that Bing Chat achieved an accuracy of 81%, which is equal to that of GPT-4 and significantly higher than the 66% accuracy of GPT-3.5 [<xref ref-type="bibr" rid="ref14">14</xref>]. Another study evaluating the ability of LLMs to identify the protein content of foods reported that Bing Chat achieved an accuracy of 63.6%, outperforming GPT-4, which had an accuracy of 60.6% [<xref ref-type="bibr" rid="ref15">15</xref>]. However, these findings reflect structured and well-established areas of nutritional science. 
In contrast, the field of dietary supplements is characterized by emerging research, conflicting claims, and a high prevalence of misinformation online [<xref ref-type="bibr" rid="ref16">16</xref>]. This makes it particularly challenging for real-time web-connected LLMs to generate reliable evidence-based content.</p><p>In Japan, approximately 50% of adults report regular or occasional dietary supplement use [<xref ref-type="bibr" rid="ref17">17</xref>]. Worldwide, the supplement market continues to grow rapidly, with consumers increasingly relying on these products for health maintenance and disease prevention [<xref ref-type="bibr" rid="ref18">18</xref>]. However, in many countries, dietary supplements are not as strictly regulated as pharmaceuticals, leaving users heavily dependent on internet-based information. A previous study found that the prevalence of misinformation regarding dietary supplements on the internet was significantly higher than that regarding many other health-related domains [<xref ref-type="bibr" rid="ref18">18</xref>]. Inaccurate information can lead to misinformed health decisions, unnecessary financial costs, and adverse outcomes. Moreover, although Copilot includes reference links in its responses, few studies have examined the trustworthiness of these sources. 
If misinformation is embedded in an AI-generated summary and in the referenced content, the risk to users is amplified [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>Therefore, a comprehensive evaluation of Copilot&#x2019;s reliability in the context of dietary supplement information is crucial for advancing health communication research and supporting safe and informed decision-making in everyday health practices.</p></sec><sec id="s1-2"><title>Study Aims and Objectives</title><p>We simulated Japanese consumers&#x2019; use of Copilot to inquire about dietary supplements to clarify the following issues:</p><list list-type="order"><list-item><p>&#x2003;What proportion of Copilot responses characterize a dietary supplement as effective, ineffective, or uncertain?</p></list-item><list-item><p>&#x2003;How does Copilot perform when responding to questions related to various disease categories?</p></list-item><list-item><p>&#x2003;To what extent are Copilot&#x2019;s responses accurate in the context of dietary supplement information?</p></list-item><list-item><p><named-content content-type="indent">&#x2003;</named-content>What types of sources does Copilot cite in its responses, and how trustworthy are these references?</p></list-item><list-item><p>&#x2003;What types of common errors appear in Copilot&#x2019;s answers, and how might these inaccuracies mislead users?</p></list-item><list-item><p>&#x2003;Are there any notable differences among responses generated by the 3 different versions of Copilot?</p></list-item></list><p>By exploring these research questions, we aimed to understand how internet-based LLMs respond to inquiries about dietary supplements and identify current limitations in their performance. 
The findings of this study will contribute to the development of more reliable AI in the future and support general consumers in making informed and responsible decisions regarding the use of AI tools.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dietary Supplement Keywords and Questions</title><p>We selected the top 30 dietary supplements by market share in Japan in 2023 as identified in chapter 4, &#x201C;Present Situation and Prospects of the Health Food Category Market,&#x201D; of the 2023 edition of Healthy Foods Market Stats and Prospects, Market Survey Edition, published by Yano Research Institute Ltd (<xref ref-type="other" rid="box1">Textbox 1</xref>) [<xref ref-type="bibr" rid="ref20">20</xref>]. All the supplements are available in Japan.</p><boxed-text id="box1"><title> Keywords of the 30 dietary supplements.</title><list list-type="bullet"><list-item><p><italic>Aojiru</italic></p></list-item><list-item><p><italic>Agaricus</italic></p></list-item><list-item><p><italic>Ginkgo biloba extract</italic></p></list-item><list-item><p><italic>Turmeric</italic></p></list-item><list-item><p><italic>Royal jelly</italic></p></list-item><list-item><p><italic>Ornithine</italic></p></list-item><list-item><p><italic>Oyster extract</italic></p></list-item><list-item><p><italic>Chlorella</italic></p></list-item><list-item><p><italic>Glucosamine</italic></p></list-item><list-item><p><italic>Chitin and chitosan</italic></p></list-item><list-item><p><italic>Ubiquinone</italic></p></list-item><list-item><p><italic>Chinese softshell turtle</italic></p></list-item><list-item><p><italic>Black vinegar</italic></p></list-item><list-item><p><italic>Squalene</italic></p></list-item><list-item><p><italic>Collagen</italic></p></list-item><list-item><p><italic>Oriental ginseng</italic></p></list-item><list-item><p><italic>Soy isoflavone</italic></p></list-item><list-item><p><italic>DHA</italic> and 
<italic>EPA</italic></p></list-item><list-item><p><italic>Garlic</italic></p></list-item><list-item><p><italic>Lactic acid bacteria</italic></p></list-item><list-item><p><italic>Hyaluronic acid</italic></p></list-item><list-item><p><italic>Vitamin E</italic></p></list-item><list-item><p><italic>Vitamin C</italic></p></list-item><list-item><p><italic>Placenta</italic></p></list-item><list-item><p><italic>Blueberry</italic> and <italic>bilberry</italic></p></list-item><list-item><p><italic>Prune</italic></p></list-item><list-item><p><italic>Propolis</italic></p></list-item><list-item><p><italic>Maca</italic></p></list-item><list-item><p><italic>Euglena</italic></p></list-item><list-item><p><italic>Calcium</italic></p></list-item></list></boxed-text><p>The National Institute of Health and Nutrition in Japan has released evaluations on the effectiveness of numerous dietary supplements across a wide range of health domains (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref21">21</xref>]. These domains include the circulatory and respiratory systems, digestive and hepatic systems, endocrine and diabetic conditions, reproductive and urinary systems, brain and sensory functions, immune responses, cancer and inflammatory conditions, musculoskeletal health, developmental processes, and obesity. 
On the basis of these classifications, we identified 6 common disease areas and developed the corresponding question sets.</p><p>For each dietary supplement, 6 questions were generated with reference to a report issued by the National Institute of Health and Nutrition (<xref ref-type="other" rid="box2">Textbox 2</xref>).</p><boxed-text id="box2"><title> The 6 questions presented to Copilot.</title><list list-type="bullet"><list-item><p>Question 1: &#x201C;Is [supplement name, eg, aojiru] effective against cancer?&#x201D;</p></list-item><list-item><p>Question 2: &#x201C;Is [supplement name, eg, aojiru] effective against diabetes?&#x201D;</p></list-item><list-item><p>Question 3: &#x201C;Is [supplement name, eg, aojiru] effective against obesity?&#x201D;</p></list-item><list-item><p>Question 4: &#x201C;Is [supplement name, eg, aojiru] effective against constipation?&#x201D;</p></list-item><list-item><p>Question 5: &#x201C;Is [supplement name, eg, aojiru] effective against joint pain?&#x201D;</p></list-item><list-item><p>Question 6: &#x201C;Is [supplement name, eg, aojiru] effective against hypertension?&#x201D;</p></list-item></list></boxed-text></sec><sec id="s2-2"><title>Tested LLMs and Data Collection</title><p>This study evaluated Copilot&#x2019;s 3 response modes (creative, balanced, and precise), which have since been integrated into a single mode, known as Copilot [<xref ref-type="bibr" rid="ref22">22</xref>]. The creative, balanced, and precise modes differ primarily in response style. The creative mode tends to generate longer, more exploratory answers; the precise mode produces concise and factual outputs; and the balanced mode lies between the other 2. All modes share the same underlying model and search results [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. The data were collected between July and September 2023. 
Each question was posed once to each of the 3 Copilot modes, and the generated responses were recorded in a Microsoft Excel spreadsheet. To prevent potential context carryover, the chat window was closed after each query, and a new session was opened before submitting the next question. To ensure that the simulation reflected real-world user scenarios, we did not use prompts in our questions. To assess the reliability of the information sources cited by Copilot, 2 authors (ML and YX) collected all the referenced links from the responses and categorized their source types. Disagreements were resolved through discussion to reach a final consensus.</p><p>In addition, to prevent cross-interference between responses related to different dietary supplements, we closed the existing chat after completing the questions for one supplement and initiated a new conversation before proceeding to the next.</p></sec><sec id="s2-3"><title>Quantitative Analysis</title><p>The reports published by the National Institute of Health and Nutrition in Japan did not directly state whether a given dietary supplement was effective against a specific disease. Instead, a broad range of experimental studies were compiled that examined the effects of each supplement on various diseases. These studies used diverse methodologies, including randomized controlled trials, meta-analyses, and animal experiments. Consequently, the findings for the same supplement-disease pair may vary, with some studies reporting positive effects and others reporting no effects.</p><p>To address this variability, 2 licensed Japanese physicians (YN and RS) developed a comprehensive evaluation framework (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Using this framework and reports from the National Institute of Health and Nutrition, they assessed the evidence-based effectiveness of the 30 dietary supplements for the 6 diseases. 
Each outcome was categorized as &#x201C;effective,&#x201D; &#x201C;uncertain,&#x201D; or &#x201C;ineffective.&#x201D; In cases of conflicting evidence, consensus was reached through discussion.</p><p>The same two authors (YN and RS) then evaluated the responses generated by Copilot regarding the effectiveness of dietary supplements for the 6 diseases using a separate evaluation guideline they developed (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). These outcomes were similarly classified as &#x201C;effective,&#x201D; &#x201C;uncertain,&#x201D; and &#x201C;ineffective.&#x201D; The evaluation was conducted in a double-blind manner, and interrater agreement was measured using the Fleiss &#x03BA;. Disagreements were resolved through consensus.</p><p>Finally, the effectiveness stated by Copilot was compared with evidence-based assessments from scientific literature. Responses consistent with the reference assessment were classified as correct; all others were classified as incorrect. All the responses and classification results were recorded in Microsoft Excel (Office 2019 Professional Plus; 64 bits). To determine the statistical significance between groups, 2-tailed <italic>z</italic> tests were conducted [<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec><sec id="s2-4"><title>Qualitative Analysis</title><p>In addition to the quantitative assessment of Copilot&#x2019;s response accuracy, this study used a qualitative approach to analyze the content of the responses. All 540 responses were thoroughly reviewed, and the key characteristics and issues were systematically documented in Microsoft Excel. Common patterns and errors were identified, and relevant excerpts were cited in Japanese with English translations to support our findings.</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>All information used in this study was obtained from publicly available sources. 
Therefore, no ethical approval was required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Evidence-Based Effectiveness of Dietary Supplements</title><p>The results of the evaluation of the effectiveness of the 30 dietary supplements for the 6 diseases can be found in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. The &#x201C;A,&#x201D; &#x201C;B,&#x201D; and &#x201C;C&#x201D; designations correspond to &#x201C;effective,&#x201D; &#x201C;uncertain,&#x201D; and &#x201C;ineffective,&#x201D; respectively. Most dietary supplements had no or uncertain effects on the diseases. Among all supplements, only turmeric and hyaluronic acid were deemed effective for joint pain, whereas black vinegar was deemed effective for hypertension.</p></sec><sec id="s3-2"><title>Proposed Effectiveness of Dietary Supplements in Copilot Responses</title><p>The Fleiss &#x03BA; value measuring interrater agreement between the two evaluators was 0.70, indicating substantial consistency. The distribution of responses from the Copilot creative, balanced, and precise modes regarding the effectiveness of the 30 dietary supplements across the 6 diseases is shown in <xref ref-type="table" rid="table1">Table 1</xref>. Specifically, the proportion of responses indicating that the supplements were &#x201C;effective&#x201D; was 29.4% (53/180) for the creative mode, 47.8% (86/180) for the balanced mode, and 45% (81/180) for the precise mode. A statistically significant difference was observed between creative and the other two modes (creative vs balanced: <italic>P</italic>&#x003C;.001; creative vs precise: <italic>P</italic>=.002; balanced vs precise: <italic>P</italic>=.59). The creative, balanced, and precise modes generated 47.2% (85/180), 30% (54/180), and 30.6% (55/180) of responses categorized as &#x201C;uncertain,&#x201D; respectively. 
Similarly, the difference between creative and the other 2 modes was statistically significant (creative vs balanced: <italic>P</italic>&#x003C;.001; creative vs precise: <italic>P</italic>=.001; balanced vs precise: <italic>P</italic>=.90). The proportion of responses categorized as &#x201C;ineffective&#x201D; was 23.3% (42/180) for the creative mode, 22.2% (40/180) for the balanced mode, and 24.4% (44/180) for the precise mode. No significant differences were observed among the 3 modes in this category (<italic>P</italic>=.88).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Prevalence of responses from Copilot categorized as &#x201C;effective,&#x201D; &#x201C;uncertain,&#x201D; and &#x201C;ineffective&#x201D; for the 6 diseases.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Response category</td><td align="left" valign="bottom">Copilot creative mode, n (%)</td><td align="left" valign="bottom">Copilot balanced mode, n (%)</td><td align="left" valign="bottom">Copilot precise mode, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">&#x201C;Is [supplement name, eg, aojiru] effective against cancer?&#x201D; (n=30)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effective</td><td align="left" valign="top">2 (6.7)</td><td align="left" valign="top">7 (23.3)</td><td align="left" valign="top">6 (20)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uncertain</td><td align="left" valign="top">20 (66.7)</td><td align="left" valign="top">16 (53.3)</td><td align="left" valign="top">15 (50)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineffective</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">7 
(23.3)</td><td align="left" valign="top">9 (30)</td></tr><tr><td align="left" valign="top" colspan="4">&#x201C;Is [supplement name] effective against diabetes?&#x201D; (n=30)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effective</td><td align="left" valign="top">4 (13.3)</td><td align="left" valign="top">15 (50)</td><td align="left" valign="top">12 (40)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uncertain</td><td align="left" valign="top">19 (63.3)</td><td align="left" valign="top">10 (33.3)</td><td align="left" valign="top">14 (46.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineffective</td><td align="left" valign="top">7 (23.3)</td><td align="left" valign="top">5 (16.7)</td><td align="left" valign="top">4 (13.3)</td></tr><tr><td align="left" valign="top" colspan="4">&#x201C;Is [supplement name] effective against obesity?&#x201D; (n=30)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effective</td><td align="left" valign="top">9 (30)</td><td align="left" valign="top">17 (56.7)</td><td align="left" valign="top">16 (53.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uncertain</td><td align="left" valign="top">15 (50)</td><td align="left" valign="top">5 (16.7)</td><td align="left" valign="top">6 (20)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineffective</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">8 (26.7)</td></tr><tr><td align="left" valign="top" colspan="4">&#x201C;Is [supplement name] effective 
against constipation?&#x201D; (n=30)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effective</td><td align="left" valign="top">10 (33.3)</td><td align="left" valign="top">21 (70)</td><td align="left" valign="top">17 (56.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uncertain</td><td align="left" valign="top">15 (50)</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">4 (13.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineffective</td><td align="left" valign="top">5 (16.7)</td><td align="left" valign="top">3 (10)</td><td align="left" valign="top">9 (30)</td></tr><tr><td align="left" valign="top" colspan="4">&#x201C;Is [supplement name] effective against joint pain?&#x201D; (n=30)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effective</td><td align="left" valign="top">9 (30)</td><td align="left" valign="top">10 (33.3)</td><td align="left" valign="top">10 (33.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uncertain</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">9 (30)</td><td align="left" valign="top">9 (30)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineffective</td><td align="left" valign="top">13 (43.3)</td><td align="left" valign="top">11 (36.7)</td><td align="left" valign="top">11 (36.7)</td></tr><tr><td align="left" valign="top" colspan="4">&#x201C;Is [supplement name] effective against hypertension?&#x201D; (n=30)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effective</td><td align="left" valign="top">19 (63.3)</td><td align="left" valign="top">16 (53.3)</td><td align="left" valign="top">20 (66.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uncertain</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">7 (23.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineffective</td><td align="left" valign="top">3 (10)</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">3 (10)</td></tr><tr><td align="left" valign="top" colspan="4">Total answers (n=180)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effective</td><td align="left" valign="top">53 (29.4)</td><td align="left" valign="top">86 (47.8)</td><td align="left" valign="top">81 (45)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uncertain</td><td align="left" valign="top">85 (47.2)</td><td align="left" valign="top">54 (30)</td><td align="left" valign="top">55 (30.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineffective</td><td align="left" valign="top">42 (23.3)</td><td align="left" valign="top">40 (22.2)</td><td align="left" valign="top">44 (24.4)</td></tr></tbody></table></table-wrap><p>When analyzed by disease, Copilot gave the fewest responses that indicated supplement effectiveness for cancer treatment, with 17% (15/90) of the responses across all 3 modes. 
In contrast, the highest proportion of responses suggesting effectiveness was observed for hypertension, with 61% (55/90) of the responses.</p><p>Responses indicating supplement ineffectiveness were lowest for hypertension (12/90, 13%) and highest for joint pain (35/90, 39%).</p><p>Regarding responses classified as &#x201C;uncertain,&#x201D; hypertension had the fewest (23/90, 26%), whereas cancer had the most (51/90, 57% across all modes).</p></sec><sec id="s3-3"><title>Accuracy of Copilot Responses</title><p>The accuracy rates of the Copilot responses are summarized in <xref ref-type="table" rid="table2">Table 2</xref>. Overall, the accuracies of the creative, balanced, and precise modes were 36.1% (65/180), 31.7% (57/180), and 31.7% (57/180), respectively, with no significant differences among them (<italic>P</italic>=.59).</p><p>When examined by disease category, Copilot showed the highest average accuracy for cancer-related questions (44/90, 49%). In contrast, the lowest accuracy was observed for constipation-related questions with 19% (17/90).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Correctness of the responses for different diseases.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Copilot creative mode, n (%)</td><td align="left" valign="bottom">Copilot balanced mode, n (%)</td><td align="left" valign="bottom">Copilot precise mode, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">&#x201C;Is [supplement name] effective against cancer?&#x201D; (n=30)</td><td align="left" valign="top">17 (56.7)</td><td align="left" valign="top">14 (46.7)</td><td align="left" valign="top">13 (43.3)</td></tr><tr><td align="left" valign="top">&#x201C;Is [supplement name] effective against diabetes?&#x201D; (n=30)</td><td align="left" valign="top">13 (43.3)</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">10 
(33.3)</td></tr><tr><td align="left" valign="top">&#x201C;Is [supplement name] effective against obesity?&#x201D; (n=30)</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">8 (26.7)</td><td align="left" valign="top">7 (23.3)</td></tr><tr><td align="left" valign="top">&#x201C;Is [supplement name] effective against constipation?&#x201D; (n=30)</td><td align="left" valign="top">5 (16.7)</td><td align="left" valign="top">3 (10)</td><td align="left" valign="top">9 (30)</td></tr><tr><td align="left" valign="top">&#x201C;Is [supplement name] effective against joint pain?&#x201D; (n=30)</td><td align="left" valign="top">16 (53.3)</td><td align="left" valign="top">14 (46.7)</td><td align="left" valign="top">13 (43.3)</td></tr><tr><td align="left" valign="top">&#x201C;Is [supplement name] effective against hypertension?&#x201D; (n=30)</td><td align="left" valign="top">6 (20)</td><td align="left" valign="top">11 (36.7)</td><td align="left" valign="top">5 (16.7)</td></tr><tr><td align="left" valign="top">Total (n=180)</td><td align="left" valign="top">65 (36.1)</td><td align="left" valign="top">57 (31.7)</td><td align="left" valign="top">57 (31.7)</td></tr></tbody></table></table-wrap></sec><sec id="s3-4"><title>Sources Cited in Copilot Responses</title><p>Across 540 responses (3 modes &#x00D7; 30 dietary supplements &#x00D7; 6 diseases), Copilot cited 3081 links, averaging 5.7 sources per response. These sources were categorized into 2 major groups and 14 subcategories (<xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). 
One of the major groups was unverified sources, accounting for 72.7% (2240/3081) of all citations, and included the following subcategories: Bing search page, introduction of food or pharmaceutical sales websites (mainly third-party commercial websites and excluding manufacturers), unregulated medical knowledge websites (eg, blogs or personal articles), Amazon product page for the supplement, other social media platforms (X [formerly known as Twitter], Facebook, YouTube, and Zhihu), and invalid links. The other major group was verified sources, which made up the remaining 27.3% (841/3081) of the citations and included food and pharmaceutical manufacturer websites, hospital and clinic websites, news, Wikipedia, government websites (eg, the Ministry of Health, Labour, and Welfare and local governments), individual research introduction websites (eg, Nature, PubMed, J-GLOBAL, RIKEN, university research highlights, and laboratory websites), pharmacist or medical association websites, and academic conferences.</p><p>Among all the subcategories, unregulated medical knowledge websites were the most frequently cited, with 61.4% (1893/3081) of the citations. Other categories with &#x003E;5% of the total citations included food and pharmaceutical manufacturer websites (241/3081, 7.8% of the citations), Bing search pages (236/3081, 7.7% of the citations), and individual research introduction websites (224/3081, 7.3% of the citations). News, Wikipedia, product sales websites, hospital and clinic websites, and government websites each accounted for 1% to 5% of the citations. 
Sources such as pharmacist or medical association websites, Amazon, other social media platforms, academic conferences, and invalid links each accounted for &#x003C;1% of the total citations.</p><p>From a disease-specific perspective, the proportion of citations from unregulated medical knowledge websites was highest for obesity and constipation, accounting for 68.9% (367/533) and 69.7% (347/498), respectively. These rates were significantly higher than those for other diseases (cancer vs obesity: <italic>P</italic>&#x003C;.001; diabetes vs obesity: <italic>P</italic>&#x003C;.001; joint pain vs obesity: <italic>P</italic>=.007; hypertension vs obesity: <italic>P</italic>=.008; cancer vs constipation: <italic>P</italic>&#x003C;.001; diabetes vs constipation: <italic>P</italic>&#x003C;.001; joint pain vs constipation: <italic>P</italic>=.004; hypertension vs constipation: <italic>P</italic>=.004). No significant differences were observed in the citation proportions of other source categories across different diseases.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Categories of websites cited by Copilot.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Categories</td><td align="left" valign="top">Total, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Unverified websites</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Bing search page</td><td align="left" valign="top">236 (7.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Invalid links</td><td align="left" valign="top">24 (0.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Unregulated medical knowledge websites (eg, blogs, personal articles)</td><td 
align="left" valign="top">1893 (61.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Introduction of food/pharmaceutical sales websites (excluding manufacturers)</td><td align="left" valign="top">67 (2.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Amazon</td><td align="left" valign="top">11 (0.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other social media platforms (Twitter, Facebook, YouTube, Zhihu)</td><td align="left" valign="top">10 (0.3)</td></tr><tr><td align="left" valign="top">Verified websites</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Food and pharmaceutical manufacturer websites</td><td align="left" valign="top">241 (7.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hospital and clinic websites</td><td align="left" valign="top">123 (4.0)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Government websites (eg, Ministry of Health, Labour and Welfare, local governments)</td><td align="left" valign="top">46 (1.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Individual research introduction websites (eg, Nature, PubMed, J-Global, RIKEN, university research highlights, lab websites)</td><td align="left" valign="top">224 (7.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Pharmacist or medical association websites</td><td align="left" valign="top">13 
(0.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Academic conferences</td><td align="left" valign="top">7 (0.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>News</td><td align="left" valign="top">99 (3.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Wikipedia</td><td align="left" valign="top">87 (2.8)</td></tr></tbody></table></table-wrap></sec><sec id="s3-5"><title>Content Analysis</title><p>Copilot responses generally followed a 3-part template. The first section provided a direct answer to the question of whether a specific supplement was effective against a given disease. When expressing a positive stance, Copilot often used highly assertive language, such as &#x201C;Yes, [supplement] is effective for [disease].&#x201D; However, when presenting a negative view, it frequently used more ambiguous phrasing&#x2014;for example, &#x201C;Although there is no scientific evidence supporting the supplement&#x2019;s effectiveness for the disease, it may still have potential benefits&#x201D; or &#x201C;While it does not act directly on the condition, it might exert indirect effects.&#x201D;</p><p>The second section typically offered a detailed description of the nutritional components of the supplements and their potential physiological effects. This section included citations from several online sources. However, upon reviewing these sources individually, we found that some did not support the claims made in the corresponding Copilot responses.</p><p>The third section served as a summary of the overall responses. In many cases, this section partially replicated the opinions stated in the first section. However, this approach often introduced additional statements that diluted or contradicted an initial stance. 
For example, even if the first section endorsed the supplement&#x2019;s effectiveness, the summary might include phrases such as &#x201C;The effects of the supplement may vary from person to person,&#x201D; &#x201C;There is no definitive conclusion&#x2014;some studies support its benefits, while others do not,&#x201D; or &#x201C;Excessive intake of the supplement may have adverse effects,&#x201D; thereby leaning toward a more skeptical or cautious tone. In addition, of the 540 responses, only 262 (48.5%) included a recommendation in the third section for users to consult a health care professional.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our study is the first to comprehensively evaluate an AI search engine&#x2019;s response quality on supplement-related queries. Overall, the AI search engine cited numerous unverified websites and achieved an accuracy of approximately 33.1% (179/540).</p><p>The number of responses indicating that dietary supplements were ineffective was nearly identical across the 3 modes of Copilot: 23.3% (42/180) for the creative mode, 22.2% (40/180) for the balanced mode, and 24.4% (44/180) for the precise mode. The balanced and precise modes yielded a higher number of responses that indicated supplement effectiveness&#x2014;47.8% (86/180) and 45% (81/180), respectively&#x2014;whereas the proportion of &#x201C;uncertain&#x201D; responses was 30% (54/180) and 30.6% (55/180), respectively. In contrast, the creative mode showed fewer &#x201C;effective&#x201D; responses (53/180, 29.4%) and more &#x201C;uncertain&#x201D; ones (85/180, 47.2%). 
Overall, the balanced and precise modes exhibited similar response patterns, whereas the creative mode was more cautious, with a higher proportion of uncertain responses and fewer confident claims of effectiveness.</p><p>When comparing Copilot&#x2019;s reported effectiveness with evidence-based evaluations, both the balanced and precise modes demonstrated an accuracy of 31.7% (57/180), whereas the creative mode achieved a slightly higher accuracy of 36.1% (65/180); however, this difference was not statistically significant (<italic>P</italic>=.38). Generally, the accuracy of all 3 modes was suboptimal, with none exceeding 40%, and all fell well below the accuracy levels reported in previous studies that tested Copilot in the domain of dietary planning and nutrition [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>In addition, we conducted a detailed review and classification of all the cited sources in the Copilot responses, which makes this the first study to systematically analyze citation quality. We found that 72.7% (2240/3081) of the citations came from unverified or nonauthoritative sources, including the Bing search page; introduction of food or pharmaceutical sales websites (excluding manufacturers); unregulated medical knowledge websites (eg, blogs or opinion pieces); Amazon; other social media platforms (X [formerly known as Twitter], Facebook, YouTube, and Zhihu); and invalid links. In contrast, only 27.3% (840/3081) of the citations came from verified and credible sources, such as news; Wikipedia; food and pharmaceutical company websites; hospital and clinic websites; government websites (eg, the Ministry of Health, Labour, and Welfare and local governments); individual research introduction websites (eg, Nature, PubMed, J-GLOBAL, RIKEN, university research highlights, and laboratory websites); pharmacist or medical association websites, and academic conferences. 
Among all the citation categories, unregulated medical knowledge websites were the most frequently cited across all 3 modes. This finding suggests that the commercial purpose of many unregulated sites makes them more biased and less authoritative as nutritional supplements are over-the-counter consumer items. Moreover, web-scraped datasets often contain advertising content from social media and online articles. Notably, for cancer-related queries, the proportion of such unregulated sources was lower at approximately 54.5% (307/563). In contrast, approximately 69.3% (714/1031) of the sources for obesity (68.9%, 367/533) and constipation (69.7%, 347/498) fell into this unverified category. This substantial imbalance highlights a key concern in health communication: AI-generated content may present unverified sources in a polished, authoritative format, creating a &#x201C;credibility illusion&#x201D; that enhances user trust while disseminating misinformation. This illusion poses a particular risk in public health contexts where information reliability is essential for informed decision-making.</p><p>We believe that the low accuracy of Copilot in the field of dietary supplements can be attributed to several key factors. First, dietary planning and nutrition are domains characterized by well-structured knowledge with clearly established guidelines and recommendations grounded in scientific consensus. Questions in nutrition and medical areas tend to have definitive answers and logical reasoning paths that align well with the strengths of LLMs [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. However, the effectiveness of dietary supplements remains a subject of scientific controversy [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. 
For example, studies on glucosamine&#x2019;s effects on joint health include both positive and negative findings even among meta-analyses [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Second, the prevalence of inaccurate or misleading information is significantly lower in the dietary planning and nutrition domains than in the dietary supplement domain [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. The former is typically documented in textbooks, peer-reviewed literature, and official guidelines, whereas the latter often includes a wide range of unverified claims from advertisements and personal blogs [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. Our study found that 72.7% (2240/3081) of the sources cited by Copilot were from unverified websites. Because the internet serves as a major component of LLM training data, the abundance of unverified content likely affects the model&#x2019;s response accuracy in this domain. Third, unlike ChatGPT, which screens and synthesizes information from its training data, Bing is fundamentally a search engine that quotes content directly from web pages. Consequently, erroneous or misleading information may be presented to users without screening. Notably, we observed that the creative mode achieved higher accuracy than the balanced and precise modes. This may be due to its higher-temperature setting, which allows for more exploratory reasoning and the integration of information from multiple sources [<xref ref-type="bibr" rid="ref34">34</xref>]. Paradoxically, when training data contain a large volume of inaccurate or conflicting content, as is common in the dietary supplement domain, a higher temperature may enable the model to reason beyond dominant but incorrect narratives, thus improving the response quality [<xref ref-type="bibr" rid="ref34">34</xref>]. 
In contrast, the balanced and precise modes may rely more heavily on conservative surface-level content from citations, leading to less accurate responses. Therefore, we hypothesized that, in complex and controversial domains, higher-temperature models may perform better by generating responses through more bold and exploratory reasoning and a broader synthesis of information.</p><p>Across the different disease categories, all 3 Copilot modes provided the fewest responses that indicated the effectiveness of dietary supplements for cancer treatment and achieved the highest accuracy for this disease. One possible explanation is that, owing to the complexity and severity of cancer, medical information related to its treatment is subject to strict scrutiny and regulation [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. Compared with other health topics, there is a growing consensus among authoritative organizations and scientific literature that emphasizes the lack of credible evidence supporting the use of dietary supplements in cancer therapy [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. As Copilot is trained on such reliable sources, it is more likely to adopt a cautious stance when addressing cancer-related questions. Furthermore, a Japanese study found no advertisements promoting dietary supplements as effective cancer treatments, suggesting that cancer&#x2014;being a serious and life-threatening disease&#x2014;is rarely the focus of supplement marketing [<xref ref-type="bibr" rid="ref38">38</xref>]. Consequently, misinformation about supplements in the context of cancer is likely to be less prevalent than in other disease areas. 
This relative scarcity of misleading information in the training data may reduce the chances of LLMs incorrectly asserting that supplements are effective against cancer.</p><p>In addition to quantitative analysis, we conducted a thorough qualitative review of Copilot responses and found that they followed a highly templated structure: the first section provided a 1-sentence answer to the question, the second section elaborated on the components of the dietary supplement and their potential effects, and the third section offered a summary.</p><p>In the first section, Copilot consistently leaned toward affirming, or at least not fully denying, the effectiveness of dietary supplements. Even when initially rejecting the efficacy of a supplement, the response was often followed by hedging statements such as &#x201C;it may still have some benefits&#x201D; or &#x201C;it could exert indirect effects.&#x201D; We interpret this as a result of the model&#x2019;s tendency to align its responses with the users&#x2019; implicit expectations by providing more positive information. This behavior is consistent with the well-documented phenomenon of sycophancy in LLMs, where the model adjusts its outputs to reflect the tone or assumptions of user inputs, sometimes at the expense of factual accuracy [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. This tendency is particularly problematic in domains such as dietary supplements, where scientific evidence is often inconclusive or disputed. Ambiguous responses catering to user expectations may inadvertently mislead consumers. Therefore, we recommend that future model updates prioritize reducing such ambiguity to minimize the risk of misinformation and improve the reliability of health-related AI responses.</p><p>In the second section, Copilot typically cited several websites to support the claims made in the first section. However, 2 notable and concerning patterns were identified. 
First, in most cases, the model directly repeated claims from the cited sources without conducting any meaningful synthesis or critical evaluation. Consequently, misleading or inaccurate information from nonauthoritative websites was often presented to users without a filter. Second, upon reviewing the linked content individually, we discovered that approximately 9.6% (52/540) of the responses included fabricated claims&#x2014;statements in Copilot&#x2019;s responses were attributed to sources that did not contain such information. In most of these cases, Copilot suggested a health effect of a dietary supplement, yet the cited link provided no evidence or mention of that specific claim. Copilot likely exhibited AI hallucinations, a phenomenon in which the model generates content that appears plausible but is factually incorrect or entirely fabricated [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref43">43</xref>]. In these cases, the model appears to have first &#x201C;invented&#x201D; a claim about a supplement&#x2019;s effect and then &#x201C;fabricated&#x201D; support for that claim by attaching existing but unrelated or irrelevant citations. The citation itself is real; however, the information it contains does not support the model&#x2019;s opinion [<xref ref-type="bibr" rid="ref44">44</xref>].</p><p>In the third section, we observed that many responses diluted the affirmative claims presented in the first section. For instance, the model frequently added statements such as &#x201C;the effects may vary between individuals&#x201D; or &#x201C;excessive intake may cause adverse effects.&#x201D; This suggests that Copilot adopts a cautious and self-protective response strategy characterized by compromise and hedging. This conservative generation strategy is particularly prevalent when dealing with sensitive topics such as health, medicine, and nutrition [<xref ref-type="bibr" rid="ref45">45</xref>]. 
Additionally, we found that only 48.5% (262/540) of all responses included recommendations for users to consult health care professionals. We believe that, for medicine-related queries, it would be more appropriate to universally include recommendations for users to follow professional medical advice.</p><p>From a health communication perspective, our findings underscore the dual role of internet-based AI tools such as Copilot in shaping the public understanding of health information. On the one hand, AI-generated responses can present complex medical content in a simplified and accessible manner, potentially lowering barriers to health literacy and supporting informed decision-making, particularly among populations with limited access to professional health care resources. On the other hand, our analysis revealed that Copilot frequently cites unverified sources and uses ambiguous or overly agreeable language when addressing health-related queries. This may contribute to a &#x201C;credibility illusion,&#x201D; whereby users perceive AI-generated content as trustworthy owing to its polished presentation and apparent authority regardless of its actual evidentiary basis. In domains such as dietary supplements, where scientific evidence is frequently inconclusive and commercial interest is strong, this illusion poses a significant risk of misinformation. Moreover, such dynamics can exacerbate existing health information asymmetries by disproportionately affecting users who cannot critically assess online content quality. These findings highlight the urgent need to embed the core principles of health communication, such as informed choice, audience empowerment, and transparency, into the design and governance of AI systems. 
Only through such efforts can these technologies fulfill their promise as facilitators of public health rather than inadvertent amplifiers of health misinformation.</p></sec><sec id="s4-2"><title>Limitations</title><p>First, this study evaluated Copilot&#x2019;s creative, balanced, and precise modes, which were later merged into a single default mode. Reports suggest that the 3 modes mainly differed in style parameters (eg, verbosity and creativity) rather than underlying model architecture. Although explicit mode switches have been removed, the unified mode likely preserves a blended style; therefore, our accuracy and citation results remain informative [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>Second, this study focused on evaluating Copilot&#x2019;s performance in a Japanese-language context and did not assess its accuracy in other languages or cultural settings. As a multilingual tool, the performance of Copilot may vary depending on linguistic and cultural factors, as well as on the distribution of its training data. Given that English is likely to be overrepresented in the training corpus, responses in non-English languages such as Japanese may be less accurate because of limited high-quality data.</p></sec><sec id="s4-3"><title>Conclusions</title><p>This study is the first to evaluate the performance of an AI search engine in the dietary supplement domain. Overall, the results were suboptimal. Copilot affirmed the effectiveness of dietary supplements in approximately 40.7% (220/540) of the responses, yet the overall accuracy was only approximately 33.1% (179/540). The creative mode performed slightly better than the others, achieving an accuracy of 36.1% (65/180), suggesting that higher-temperature LLMs may perform better in complex domains. The primary challenge for Copilot seems to arise from the controversial and inconclusive nature of scientific evidence regarding dietary supplements. 
Notably, 72.7% (2240/3081) of the sources cited, such as personal blogs and sales websites, were unverified. Copilot frequently quoted information from these sources without proper screening and, in some cases, even attributed claims to sources that did not support them. In terms of response style, Copilot tended to adopt a conservative and hedging tone, often avoiding a clear affirmation or denial of supplement effectiveness. This reflects a tendency toward sycophancy, in which responses align with perceived user expectations. Finally, only half (262/540, 48.5%) of the responses included recommendations to consult medical professionals. We believe that all health-related answers should include such guidance to ensure user safety and responsibility.</p></sec></sec></body><back><ack><p>This work was supported by the Japan Society for the Promotion of Science Grant-in-Aid for Scientific Research (24KJ0830).</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name></person-group><article-title>Consumers&#x2019; evaluation of web-based health information quality: meta-analysis</article-title><source>J Med Internet 
Res</source><year>2022</year><month>04</month><day>28</day><volume>24</volume><issue>4</issue><fpage>e36463</fpage><pub-id pub-id-type="doi">10.2196/36463</pub-id><pub-id pub-id-type="medline">35482390</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nickel</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ayre</surname><given-names>J</given-names> </name><name name-style="western"><surname>Marinovich</surname><given-names>ML</given-names> </name><etal/></person-group><article-title>Are AI chatbots concordant with evidence-based cancer screening recommendations?</article-title><source>Patient Educ Couns</source><year>2025</year><month>05</month><volume>134</volume><fpage>108677</fpage><pub-id pub-id-type="doi">10.1016/j.pec.2025.108677</pub-id><pub-id pub-id-type="medline">39862490</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name></person-group><article-title>Utility of ChatGPT in clinical practice</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>28</day><volume>25</volume><fpage>e48568</fpage><pub-id pub-id-type="doi">10.2196/48568</pub-id><pub-id pub-id-type="medline">37379067</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="web"><article-title>Microsoft launches the new Bing, with ChatGPT built in</article-title><source>TechCrunch</source><access-date>2025-01-12</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://techcrunch.com/2023/02/07/microsoft-launches-the-new-bing-with-chatgpt-built-in/">https://techcrunch.com/2023/02/07/microsoft-launches-the-new-bing-with-chatgpt-built-in/</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="web"><article-title>Microsoft rebrands Bing Chat to Copilot, to better compete with ChatGPT</article-title><source>The Verge</source><access-date>2025-01-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.theverge.com/2023/11/15/23960517/microsoft-copilot-bing-chat-rebranding-chatgpt-ai">https://www.theverge.com/2023/11/15/23960517/microsoft-copilot-bing-chat-rebranding-chatgpt-ai</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Laukkonen</surname><given-names>J</given-names> </name></person-group><article-title>Microsoft Copilot: what it is, where to find it, and how much it costs</article-title><source>Lifewire Tech for Humans</source><access-date>2025-01-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.lifewire.com/what-is-bing-ai-chatbot-7371141">https://www.lifewire.com/what-is-bing-ai-chatbot-7371141</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kelly</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cornwell</surname><given-names>SE</given-names> </name><etal/></person-group><article-title>Bing chat: the future of search engines?</article-title><source>Proc Assoc Inf Sci
Technol</source><year>2023</year><month>10</month><volume>60</volume><issue>1</issue><fpage>1007</fpage><lpage>1009</lpage><pub-id pub-id-type="doi">10.1002/pra2.927</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>GPT-4</article-title><source>OpenAI</source><access-date>2025-02-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/models/gpt-4">https://platform.openai.com/docs/models/gpt-4</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kuznetsova</surname><given-names>E</given-names> </name><name name-style="western"><surname>Makhortykh</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vziatysheva</surname><given-names>V</given-names> </name><name name-style="western"><surname>Stolze</surname><given-names>M</given-names> </name><name name-style="western"><surname>Baghumyan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Urman</surname><given-names>A</given-names> </name></person-group><article-title>In generative AI we trust: can chatbots effectively verify political information?</article-title><source>J Comput Soc Sc</source><year>2025</year><month>02</month><volume>8</volume><issue>1</issue><fpage>15</fpage><pub-id pub-id-type="doi">10.1007/s42001-024-00338-8</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ashraf</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Mackey</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Fittler</surname><given-names>A</given-names> </name></person-group><article-title>Search engines and generative artificial intelligence integration: 
public health risks and recommendations to safeguard consumers online</article-title><source>JMIR Public Health Surveill</source><year>2024</year><month>03</month><day>21</day><volume>10</volume><fpage>e53086</fpage><pub-id pub-id-type="doi">10.2196/53086</pub-id><pub-id pub-id-type="medline">38512343</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Top websites ranking</article-title><source>Similarweb</source><access-date>2025-01-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.similarweb.com/engines/">https://www.similarweb.com/engines/</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>Generative AI &#x0026; legal research</article-title><source>Delaware Law School, Widener University</source><access-date>2025-09-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://libguides.law.widener.edu/c.php?g=1342893&#x0026;p=10140129">https://libguides.law.widener.edu/c.php?g=1342893&#x0026;p=10140129</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><article-title>Microsoft Copilot revenue and usage statistics (2025)</article-title><source>Business of Apps</source><year>2025</year><access-date>2025-09-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.businessofapps.com/data/microsoft-copilot-statistics/">https://www.businessofapps.com/data/microsoft-copilot-statistics/</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qarajeh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tangpanithandee</surname><given-names>S</given-names> </name><name
name-style="western"><surname>Thongprayoon</surname><given-names>C</given-names> </name><etal/></person-group><article-title>AI-powered renal diet support: performance of ChatGPT, Bard AI, and Bing Chat</article-title><source>Clin Pract</source><year>2023</year><month>09</month><day>26</day><volume>13</volume><issue>5</issue><fpage>1160</fpage><lpage>1172</lpage><pub-id pub-id-type="doi">10.3390/clinpract13050104</pub-id><pub-id pub-id-type="medline">37887080</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bayram</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Ozturkcan</surname><given-names>A</given-names> </name></person-group><article-title>AI showdown: info accuracy on protein quality content in foods from ChatGPT 3.5, ChatGPT 4, Bard AI and Bing chat</article-title><source>Br Food J</source><year>2024</year><month>08</month><day>15</day><volume>126</volume><issue>9</issue><fpage>3335</fpage><lpage>3346</lpage><pub-id pub-id-type="doi">10.1108/BFJ-02-2024-0158</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Dietary supplements market size and share report</article-title><source>Grand View Research</source><access-date>2025-10-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.grandviewresearch.com/industry-analysis/dietary-supplements-market-report">https://www.grandviewresearch.com/industry-analysis/dietary-supplements-market-report</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Japan: share of people taking dietary supplements 2023</article-title><source>Statista</source><access-date>2025-10-12</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.statista.com/statistics/1182707/japan-share-of-people-taking-dietary-supplements/">https://www.statista.com/statistics/1182707/japan-share-of-people-taking-dietary-supplements/</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Denniss</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lindberg</surname><given-names>R</given-names> </name><name name-style="western"><surname>McNaughton</surname><given-names>SA</given-names> </name></person-group><article-title>Quality and accuracy of online nutrition-related information: a systematic review of content analysis studies</article-title><source>Public Health Nutr</source><year>2023</year><month>07</month><volume>26</volume><issue>7</issue><fpage>1345</fpage><lpage>1357</lpage><pub-id pub-id-type="doi">10.1017/S1368980023000873</pub-id><pub-id pub-id-type="medline">37138366</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alber</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Alyakin</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Medical large language models are vulnerable to data-poisoning attacks</article-title><source>Nat Med</source><year>2025</year><month>02</month><volume>31</volume><issue>2</issue><fpage>618</fpage><lpage>626</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03445-1</pub-id><pub-id pub-id-type="medline">39779928</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>Healthy foods market stats and prospects: market survey edition [Web page in 
Japanese]</article-title><source>Yano Research Institute Ltd</source><year>2023</year><access-date>2025-09-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.yano.co.jp/market_reports/C64130600">https://www.yano.co.jp/market_reports/C64130600</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Health food [Web page in Japanese]</article-title><source>National Institute of Health and Nutrition</source><access-date>2023-11-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nibiohn.go.jp/eiken/info/hf2.html">https://www.nibiohn.go.jp/eiken/info/hf2.html</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Copilot</article-title><source>Microsoft</source><access-date>2023-11-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://copilot.microsoft.com/">https://copilot.microsoft.com/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>What are the conversation styles of Microsoft Copilot?</article-title><source>My FSU Service Center</source><access-date>2025-09-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://servicecenter.fsu.edu/s/article/What-are-the-conversation-styles-of-Microsoft-Copilot">https://servicecenter.fsu.edu/s/article/What-are-the-conversation-styles-of-Microsoft-Copilot</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Introduction to Microsoft Copilot: the AI assistant for Microsoft 365</article-title><source>The Inform Team</source><access-date>2025-09-11</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.theinformteam.com/blog/introduction-to-microsoft-copilot/">https://www.theinformteam.com/blog/introduction-to-microsoft-copilot/</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pandis</surname><given-names>N</given-names> </name></person-group><article-title>Comparison of 2 means (independent z test or independent t test)</article-title><source>Am J Orthod Dentofacial Orthop</source><year>2015</year><month>08</month><volume>148</volume><issue>2</issue><fpage>350</fpage><lpage>351</lpage><pub-id pub-id-type="doi">10.1016/j.ajodo.2015.05.012</pub-id><pub-id pub-id-type="medline">26232845</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT across different versions in medical licensing examinations worldwide: systematic review and meta-analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>07</month><day>25</day><volume>26</volume><fpage>e60807</fpage><pub-id pub-id-type="doi">10.2196/60807</pub-id><pub-id pub-id-type="medline">39052324</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Evaluating 
the effectiveness of advanced large language models in medical knowledge: a comparative study using Japanese national medical examination</article-title><source>Int J Med Inform</source><year>2025</year><month>01</month><volume>193</volume><fpage>105673</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105673</pub-id><pub-id pub-id-type="medline">39471700</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Large language models in dental licensing examinations: systematic review and meta-analysis</article-title><source>Int Dent J</source><year>2025</year><month>02</month><volume>75</volume><issue>1</issue><fpage>213</fpage><lpage>222</lpage><pub-id pub-id-type="doi">10.1016/j.identj.2024.10.014</pub-id><pub-id pub-id-type="medline">39532572</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Takagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Watari</surname><given-names>T</given-names> </name><name name-style="western"><surname>Erabi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name></person-group><article-title>Performance of GPT-3.5 and GPT-4 on the Japanese medical licensing examination: comparison study</article-title><source>JMIR Med Educ</source><year>2023</year><month>06</month><day>29</day><volume>9</volume><fpage>e48002</fpage><pub-id pub-id-type="doi">10.2196/48002</pub-id><pub-id 
pub-id-type="medline">37384388</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Okada</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kiuchi</surname><given-names>T</given-names> </name></person-group><article-title>Performance of ChatGPT in medical licensing examinations in countries worldwide: a systematic review and meta-analysis protocol</article-title><source>PLoS ONE</source><year>2024</year><volume>19</volume><issue>10</issue><fpage>e0312771</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0312771</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wandel</surname><given-names>S</given-names> </name><name name-style="western"><surname>J&#x00FC;ni</surname><given-names>P</given-names> </name><name name-style="western"><surname>Tendal</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Effects of glucosamine, chondroitin, or placebo in patients with osteoarthritis of hip or knee: network meta-analysis</article-title><source>BMJ</source><year>2010</year><month>09</month><day>16</day><volume>341</volume><fpage>c4675</fpage><pub-id pub-id-type="doi">10.1136/bmj.c4675</pub-id><pub-id pub-id-type="medline">20847017</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>YH</given-names> </name><name 
name-style="western"><surname>Woo</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Song</surname><given-names>GG</given-names> </name></person-group><article-title>Effect of glucosamine or chondroitin sulfate on the osteoarthritis progression: a meta-analysis</article-title><source>Rheumatol Int</source><year>2010</year><month>01</month><volume>30</volume><issue>3</issue><fpage>357</fpage><lpage>363</lpage><pub-id pub-id-type="doi">10.1007/s00296-009-0969-5</pub-id><pub-id pub-id-type="medline">19544061</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al Khaja</surname><given-names>KAJ</given-names> </name><name name-style="western"><surname>AlKhaja</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Sequeira</surname><given-names>RP</given-names> </name></person-group><article-title>Drug information, misinformation, and disinformation on social media: a content analysis study</article-title><source>J Public Health Policy</source><year>2018</year><month>08</month><volume>39</volume><issue>3</issue><fpage>343</fpage><lpage>357</lpage><pub-id pub-id-type="doi">10.1057/s41271-018-0131-2</pub-id><pub-id pub-id-type="medline">29795521</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>AI-powered Bing chat gains three distinct personalities</article-title><source>Ars Technica</source><access-date>2025-02-20</access-date><comment><ext-link ext-link-type="uri"
xlink:href="https://arstechnica.com/information-technology/2023/03/microsoft-equips-bing-chat-with-multiple-personalities-creative-balanced-precise/">https://arstechnica.com/information-technology/2023/03/microsoft-equips-bing-chat-with-multiple-personalities-creative-balanced-precise/</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Are dietary supplements safe?</article-title><source>American Cancer Society</source><access-date>2025-10-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cancer.org/cancer/managing-cancer/treatment-types/complementary-and-integrative-medicine/dietary-supplements/safety.html">https://www.cancer.org/cancer/managing-cancer/treatment-types/complementary-and-integrative-medicine/dietary-supplements/safety.html</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><article-title>Popular diets, supplements, and cancer</article-title><source>National Cancer Institute</source><access-date>2025-02-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cancer.gov/about-cancer/treatment/cam/diets-supplements">https://www.cancer.gov/about-cancer/treatment/cam/diets-supplements</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mart&#x00ED;nez</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Jacobs</surname><given-names>ET</given-names> </name><name name-style="western"><surname>Baron</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Byers</surname><given-names>T</given-names> </name></person-group><article-title>Dietary supplements and cancer prevention: balancing potential 
benefits against proven harms</article-title><source>J Natl Cancer Inst</source><year>2012</year><month>05</month><day>16</day><volume>104</volume><issue>10</issue><fpage>732</fpage><lpage>739</lpage><pub-id pub-id-type="doi">10.1093/jnci/djs195</pub-id><pub-id pub-id-type="medline">22534785</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yokota</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shirabe</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Japanese newspaper advertisements for dietary supplements before and after COVID-19: a content analysis</article-title><source>BMJ Open</source><year>2021</year><month>11</month><day>23</day><volume>11</volume><issue>11</issue><fpage>e050898</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2021-050898</pub-id><pub-id pub-id-type="medline">34815281</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Perez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ringer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Luko&#x0161;i&#x016B;t&#x0117;</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Discovering language model behaviors with model-written evaluations</article-title><source>arXiv</source><comment>Preprint posted online on Dec 19, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2212.09251</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name
name-style="western"><surname>Yao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ton</surname><given-names>JF</given-names> </name><etal/></person-group><article-title>Trustworthy LLMs: a survey and guideline for evaluating large language models&#x2019; alignment</article-title><source>arXiv</source><comment>Preprint posted online on Aug 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.05374</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cai</surname><given-names>LZ</given-names> </name><name name-style="western"><surname>Shaheen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of generative large language models on ophthalmology board-style questions</article-title><source>Am J Ophthalmol</source><year>2023</year><month>10</month><volume>254</volume><fpage>141</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1016/j.ajo.2023.05.024</pub-id><pub-id pub-id-type="medline">37339728</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Umapathi</surname><given-names>LK</given-names> </name><name name-style="western"><surname>Sankarasubbu</surname><given-names>M</given-names> </name></person-group><article-title>Med-HALT: medical domain hallucination test for large language models</article-title><conf-name>Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)</conf-name><conf-date>Dec 6-7, 2023</conf-date><conf-loc>Singapore</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2023.conll-1.21</pub-id></nlm-citation></ref><ref
id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Oniani</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shao</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>A scoping review of artificial intelligence for precision nutrition</article-title><source>Adv Nutr</source><year>2025</year><month>04</month><volume>16</volume><issue>4</issue><fpage>100398</fpage><pub-id pub-id-type="doi">10.1016/j.advnut.2025.100398</pub-id><pub-id pub-id-type="medline">40024275</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>N</given-names> </name><name name-style="western"><surname>Frieske</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Survey of hallucination in natural language generation</article-title><source>ACM Comput Surv</source><year>2023</year><month>12</month><day>31</day><volume>55</volume><issue>12</issue><fpage>1</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1145/3571730</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>SSY</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>QV</given-names> </name><name name-style="western"><surname>Vorvoreanu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ballard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vaughan</surname><given-names>JW</given-names> 
</name></person-group><article-title>&#x201C;I&#x2019;m not sure, but...&#x201D;: examining the impact of large language models&#x2019; uncertainty expression on user reliance and trust</article-title><access-date>2025-10-12</access-date><conf-name>Proceedings of the 2024 ACM Conference on Fairness, Accountability, and Transparency (FAccT &#x2019;24)</conf-name><conf-date>Jun 3-6, 2024</conf-date><conf-loc>Rio de Janeiro, Brazil</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3630106">https://dl.acm.org/doi/proceedings/10.1145/3630106</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>An example of effectiveness reports on dietary supplements for various body systems published by the National Institute of Health and Nutrition in Japan.</p><media xlink:href="ai_v4i1e78436_app1.docx" xlink:title="DOCX File, 481 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Evaluation framework for evidence-based dietary supplement effects.</p><media xlink:href="ai_v4i1e78436_app2.docx" xlink:title="DOCX File, 25 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Evidence-based effects of 30 dietary supplements for 6 diseases.</p><media xlink:href="ai_v4i1e78436_app3.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Categories of the sources cited by Copilot.</p><media xlink:href="ai_v4i1e78436_app4.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material></app-group></back></article>