<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v5i1e84322</article-id>
      <article-id pub-id-type="pmid">41672474</article-id>
      <article-id pub-id-type="doi">10.2196/84322</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluation of Large Language Models for Peer Review in Transplantation Research: Algorithm Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Liu</surname>
            <given-names>Hongfang</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Shinde</surname>
            <given-names>Ishana Vikram</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gupta</surname>
            <given-names>Anup</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Grosser</surname>
            <given-names>John</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Shen</surname>
            <given-names>Selena Ming</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-9073-326X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Zifu</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Center for Geographic Analysis</institution>
            <institution>Faculty of Arts and Sciences</institution>
            <institution>Harvard University</institution>
            <addr-line>1737 Cambridge Street</addr-line>
            <addr-line>Cambridge, MA, 02138</addr-line>
            <country>United States</country>
            <phone>1 5714855387</phone>
            <email>zifu_wang@fas.harvard.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7183-5166</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>Krittika</given-names>
          </name>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-5022-0701</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Meng-Hao</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2051-3690</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Xiao</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4323-382X</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Koizumi</surname>
            <given-names>Naoru</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8722-0898</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Pine View School</institution>
        <addr-line>Osprey, FL</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Center for Geographic Analysis</institution>
        <institution>Faculty of Arts and Sciences</institution>
        <institution>Harvard University</institution>
        <addr-line>Cambridge, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Monta Vista High School</institution>
        <addr-line>Cupertino, CA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Center for Biomedical Science and Policy</institution>
        <institution>Schar School of Policy and Government</institution>
        <institution>George Mason University</institution>
        <addr-line>Fairfax, VA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Environmental Sciences</institution>
        <institution>College of Arts and Sciences</institution>
        <institution>Emory University</institution>
        <addr-line>DeKalb, GA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Zifu Wang <email>zifu_wang@fas.harvard.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>11</day>
        <month>2</month>
        <year>2026</year>
      </pub-date>
      <volume>5</volume>
      <elocation-id>e84322</elocation-id>
      <history>
        <date date-type="received">
          <day>24</day>
          <month>9</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>26</day>
          <month>11</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>17</day>
          <month>12</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>5</day>
          <month>1</month>
          <year>2026</year>
        </date>
      </history>
      <copyright-statement>©Selena Ming Shen, Zifu Wang, Krittika Paul, Meng-Hao Li, Xiao Huang, Naoru Koizumi. Originally published in JMIR AI (https://ai.jmir.org), 11.02.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2026/1/e84322" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Peer review remains central to ensuring research quality, yet it is constrained by reviewer fatigue and human bias. The rapid rise in scientific publishing has worsened these challenges, prompting interest in whether large language models (LLMs) can support or improve the peer review process.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to address critical gaps in the use of LLMs for peer review of papers in the field of organ transplantation by (1) comparing the performance of 5 recent open-source LLMs; (2) evaluating the impact of author affiliations—prestigious, less prestigious, and none—on LLM review outcomes; and (3) examining the influence of prompt engineering strategies, including zero-shot prompting, few-shot prompting, tree of thoughts (ToT) prompting, and retrieval-augmented generation (RAG), on review decisions.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A dataset of 200 transplantation papers published between 2024 and 2025 across 4 journal quartiles was evaluated using 5 state-of-the-art open-source LLMs (Llama 3.3, Mistral 7B, Gemma 2, DeepSeek r1-distill Qwen, and Qwen 2.5). The 4 prompting techniques (zero-shot prompting, few-shot prompting, ToT prompting, and RAG) were tested under multiple temperature settings. Models were instructed to categorize papers into quartiles. To assess fairness, each paper was evaluated 3 times: with no affiliation, a prestigious affiliation, and a less prestigious affiliation. Accuracy, decisions, runtime, and computing resource use were recorded. Chi-square tests and adjusted Pearson residuals were used to examine the presence of affiliation bias.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>RAG with a temperature of 0.5 achieved the best overall performance (exact match accuracy: 0.35; loose match accuracy: 0.78). Across all models, LLMs frequently assigned manuscripts to quartile 2 and quartile 3 while avoiding extreme quartiles (quartile 1 and quartile 4). None of the models demonstrated statistically significant affiliation bias, though Gemma 2 (<italic>P</italic>=.08) and Qwen 2.5 (<italic>P</italic>=.054) showed trends approaching significance. Each model displayed unique “personalities” in quartile predictions, influencing consistency. Mistral had the highest exact match accuracy (0.35) despite having both the lowest average runtime (1246.378 seconds) and computing resource use (7 billion parameters). While accuracy was insufficient for independent review, LLMs showed value in supporting preliminary triage tasks.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Current open-source LLMs are not reliable enough to replace human peer reviewers. The largely absent affiliation bias suggests potential advantages in fairness, but these benefits do not offset the low decision accuracy. Mistral demonstrated the greatest accuracy and computational efficiency, and RAG with a moderate temperature emerged as the most effective prompting strategy. If LLMs are used to assist in peer review, their outputs require nonnegotiable human supervision to ensure correct judgment and appropriate editorial decisions.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>peer review</kwd>
        <kwd>large language models</kwd>
        <kwd>transplantation</kwd>
        <kwd>bias</kwd>
        <kwd>prompt engineering</kwd>
        <kwd>retrieval-augmented generation</kwd>
        <kwd>scholarly publishing</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Peer review is perceived as essential to the assurance of research quality and legitimacy [<xref ref-type="bibr" rid="ref1">1</xref>], but there is a growing body of literature that recognizes its shortcomings. One of the greatest challenges is reviewer fatigue: while publications have surged exponentially [<xref ref-type="bibr" rid="ref2">2</xref>], the reviewer pool has not kept pace, leaving reviewers overburdened, unrecognized, and unpaid [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Another major issue is reviewer bias. Humans are inherently biased; their life experiences, thinking styles, workload pressures, emotional state, and cognitive capacity can impact a paper’s acceptance decision [<xref ref-type="bibr" rid="ref1">1</xref>]. Previous research has established that affiliation bias, the tendency to perceive manuscripts from renowned authors as more accurate, affects human review when it is not double-blinded [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. The industry’s paradigm of human peer review is untenable due to the current model’s unsustainability and human reviewers’ inherent inconsistency. These 2 issues underscore the dire need to restructure the peer review process.</p>
        <p>A much-debated question is whether generative artificial intelligence (AI) can help address the persistent challenges of peer review [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. Large language models (LLMs) are able to complete a myriad of natural language processing tasks and have been extensively applied across medicine [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref15">15</xref>] and scientific research [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. These capabilities position LLMs to significantly reduce the burden on human reviewers. A study by Tran et al [<xref ref-type="bibr" rid="ref24">24</xref>] estimated that LLMs could reduce the peer review workload by 65%. Additionally, because LLMs lack personal motives or connections, they may help mitigate human bias in the peer review process [<xref ref-type="bibr" rid="ref25">25</xref>]. Overall, current evidence suggests that generative AI may play a critical role in the future of scholarly publishing.</p>
        <p>However, the integration of LLMs into peer review must be approached with utmost caution. Their participation in peer review has drawn heavy scrutiny due to known limitations, including factual inaccuracies, outdated content [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], and difficulties in upholding rigorous academic standards [<xref ref-type="bibr" rid="ref28">28</xref>]. Another major concern with LLMs is their propensity to amplify historical biases. Stokel-Walker and Van Noorden [<xref ref-type="bibr" rid="ref29">29</xref>] explained that “this unreliability is baked into how LLMs are built,” as they are trained on enormous datasets that include misinformation, outdated knowledge, and societal biases. This approach facilitates task-specific fine-tuning but also risks propagating harmful biases, including stereotypes and misrepresentations, that disproportionately affect communities considered marginalized [<xref ref-type="bibr" rid="ref30">30</xref>]. Notably, the presence of affiliation bias, the tendency to perceive manuscripts from renowned authors as more accurate, in LLMs is under-studied despite its great relevance to peer review.</p>
        <p>The primary research question guiding this study is as follows: Can current open LLMs reliably and fairly predict the prestige tier of the likely publication venue for a given transplantation manuscript? To address this question, this study identified the optimal combination of prompt engineering techniques and temperature settings for quartile prediction and compared LLMs in terms of decision accuracy, fairness, and runtime. To assess LLMs’ performance on specialized content—a known limitation of LLMs [<xref ref-type="bibr" rid="ref29">29</xref>]—we used exclusively transplantation papers, a relatively small and focused research field [<xref ref-type="bibr" rid="ref31">31</xref>]. Finally, we investigated the presence of affiliation bias in LLMs using chi-square tests for independence and adjusted Pearson residuals.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <sec>
          <title>Promise of LLMs in Peer Review</title>
          <p>Debate continues about whether LLMs are capable of supporting peer review. Conroy [<xref ref-type="bibr" rid="ref32">32</xref>] argues that “the naive act of asking an LLM directly to review a manuscript is likely to produce little value beyond summaries and copy-editing suggestions.” However, empirical studies show that LLMs are already being adopted in practice and may offer meaningful benefits. Liang et al [<xref ref-type="bibr" rid="ref23">23</xref>] uncovered that up to 17% of recent AI conference peer reviews were written by LLMs [<xref ref-type="bibr" rid="ref33">33</xref>].</p>
          <p>In a different study, Liang et al [<xref ref-type="bibr" rid="ref34">34</xref>] found that over half of users rated GPT-4–generated feedback as helpful or very helpful, and 82.4% rated it more beneficial than feedback from at least some human reviewers. In the same vein, Thakkar et al [<xref ref-type="bibr" rid="ref10">10</xref>] found that LLM-generated review feedback was more specific and actionable, enhancing peer review quality. Beyond generating feedback, LLMs have been used to evaluate the human peer review process and successfully identify reviewers’ biases, such as affiliation, anchoring, and gender biases [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. These findings suggest that LLMs could support certain fairness-oriented tasks within the review pipeline.</p>
          <p>Despite the emerging relevance, there is little published data on the capabilities of current open-source LLMs in peer review. Additionally, no previous study of LLM-conducted peer review has attempted to compare different prompt engineering techniques, even though prompt type can significantly impact LLM efficacy for a given task [<xref ref-type="bibr" rid="ref36">36</xref>]. Moreover, few studies have investigated LLMs in relatively low-volume research areas, such as transplantation [<xref ref-type="bibr" rid="ref31">31</xref>], where limited available data may impair LLM performance. These are critical knowledge gaps, as considerable evidence points to an already widespread use of LLMs in peer review workflows [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref37">37</xref>].</p>
        </sec>
        <sec>
          <title>Affiliation Bias of LLMs in Peer Review</title>
          <p>The academic literature on peer review has revealed the presence of affiliation bias in open peer review [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Biases are often rooted in reviewers’ personal experiences, connections, and beliefs, suggesting LLMs could potentially mitigate them in peer review. However, there are relatively few historical studies in the area of affiliation bias in LLMs in peer review. A study by von Wedel et al [<xref ref-type="bibr" rid="ref25">25</xref>] offers a detailed analysis of affiliation bias in LLM peer review. Thirty preprint abstracts were combined with 30 affiliations and were provided to OpenAI’s GPT-3.5 for acceptance or rejection. The study found that higher-tiered affiliations were marginally associated with higher acceptance rates. Strikingly, differences during LLM peer review appeared to be smaller than those in previous reports on human affiliation bias (1.7% difference in LLM acceptance rates vs 12.5% difference in human acceptance rates), suggesting that LLMs reduce affiliation bias to a negligible amount.</p>
          <p>However, the generalizability of this research was significantly constrained by several methodological limitations. The evaluation relied on a single LLM, raising questions about the applicability of the findings to the numerous available models. Furthermore, the token limit, which only allowed review of abstracts, likely diminished the relevance of the results to real-world scenarios involving full manuscript review. Finally, the dichotomous decision of acceptance or rejection did not consider the nuances of the scientific publishing ecosystem, particularly variations based on journal prestige.</p>
        </sec>
      </sec>
      <sec>
        <title>Current Challenges and Contributions of This Study</title>
        <p>Under-studied aspects of LLM use in paper review include (1) comparison of the most recent open-source LLMs, (2) effects of different prompt engineering techniques on LLM decisions, (3) LLM review in the specialized field of transplantation, and (4) amplification of affiliation bias.</p>
        <p>To address the current challenges with LLMs, this study concentrated on (1) assessing similarities and differences between several influential recent open-source LLMs [<xref ref-type="bibr" rid="ref39">39</xref>] (eg, Meta’s Llama 3.3, Mistral AI’s Mistral, Google’s Gemma 2, DeepSeek r1-distill Qwen, and Alibaba’s Qwen 2.5); (2) assessing the effect of affiliations of varying prestige on the LLMs’ output; and (3) evaluating LLM accuracy using several common prompt engineering techniques (eg, zero-shot prompting, few-shot prompting, tree of thoughts [ToT] prompting, and retrieval-augmented generation [RAG]).</p>
      </sec>
      <sec>
        <title>Data Sources</title>
        <p>The data collected and used in this study included journal articles published in the field of transplantation between 2024 and 2025. The journals were gathered from SCImago Journal and Country Rank and were separated into 4 quartiles based on rankings; each quartile represents 25% of the journals, with the first representing the top 25% of journals and the fourth representing the bottom 25% of journals [<xref ref-type="bibr" rid="ref40">40</xref>]. Journal quartiles serve as tools to assess the quality and impact of academic journals [<xref ref-type="bibr" rid="ref41">41</xref>].</p>
        <p>A total of 200 papers were gathered, with 50 (25%) articles from each quartile. Each dataset entry included the attributes title, authors, publication date, journal name, journal abstract, full paper, and respective quartile (refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for example data). The quartile and journal name were hidden when the data were input into the LLM but were later compared with LLM decisions to determine model accuracy.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>After collecting 200 recent transplant publications, each paper was processed using 4 temperatures and 4 methods: zero-shot prompting, few-shot prompting, ToT prompting, and RAG. To make a more nuanced assessment of LLM decisions, the LLMs were given 4 options (quartile 1 [Q1], quartile 2 [Q2], quartile 3 [Q3], and quartile 4 [Q4]) rather than just 2 (acceptance vs rejection). Thus, all prompting methods were tuned with prompts to assign papers to journal quartiles. RAG was implemented using the open-source library Facebook AI Similarity Search, which was used to create a vector database for each paper [<xref ref-type="bibr" rid="ref42">42</xref>]. The first round of testing was conducted using Llama 3.3 and 80 (40%) randomly sampled papers to identify a prompt-temperature combination that produced the highest accuracy (refer to <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for full prompts). This step served as a pilot hyperparameter search to determine optimal evaluation conditions. The same settings were then applied in round 2 testing, in which 5 open-source LLMs evaluated 200 (100%) papers across 3 trials: no affiliation, a prestigious affiliation, and a less prestigious affiliation. Finally, a chi-square test for independence was performed to detect whether there is an association between perceived affiliation and journal quartile. Effect sizes for associations were quantified using Cramer V, with 95% CIs calculated via nonparametric bootstrapping (5000 resamples). Accuracy scores, fairness, runtime, and computing resource use were used to compare the LLMs. This process is illustrated in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <p>Journal quartiles were used as the prediction target. Although journal-level metrics are imperfect indicators of article-level quality, prior research consistently shows that they capture meaningful, though modest, signals. Thelwall et al [<xref ref-type="bibr" rid="ref43">43</xref>] found that the correlation between article quality and journal impact was positive, with correlations around 0.4 in medicine. This aligns with broader bibliometric evidence that journals accumulate prestige largely because they tend to publish higher-quality or more influential work [<xref ref-type="bibr" rid="ref44">44</xref>].</p>
        <p>Given these empirical associations, quartiles offer a practical and reproducible proxy for the relative quality of a manuscript. Due to the lack of access to rejected or under review manuscripts, a binary accept-reject framework would have been uninformative; all papers in our dataset had already been accepted. Quartile prediction provided a more discriminative and challenging task, allowed the detection of model tendencies, and offered a standardized target that enabled controlled comparison across prompting strategies.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Workflow of investigating the presence of affiliation bias and capabilities for peer review in large language models (LLMs). RAG: retrieval-augmented generation; ToT: tree of thoughts.</p>
          </caption>
          <graphic xlink:href="ai_v5i1e84322_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Methods Configuration</title>
        <p>Refer to <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for specific models tested, prompt templates used, and the RAG methodology.</p>
        <sec>
          <title>Zero-Shot Prompting</title>
          <p>In the zero-shot prompting configuration, the model was instructed to assign the given papers to journal quartiles without any additional examples to steer it.</p>
        </sec>
        <sec>
          <title>Few-Shot Prompting</title>
          <p>Few-shot prompting is a technique that enables in-context learning using demonstrations in the prompt to steer the model to better performance [<xref ref-type="bibr" rid="ref45">45</xref>]. In the few-shot prompting configuration, an example system prompt, a provided paper, and an example response were provided. The system prompt clearly instructed the LLM to categorize given papers “in a consistent style.”</p>
        </sec>
        <sec>
          <title>ToT Prompting</title>
          <p>In ToT prompting, the LLM was encouraged to maintain a literal “tree of thoughts,” where thoughts represent coherent language sequences that serve as intermediate steps toward solving a problem. The LLM self-evaluated the progress made toward solving a problem through a deliberate reasoning process [<xref ref-type="bibr" rid="ref45">45</xref>].</p>
        </sec>
        <sec>
          <title>RAG Approach</title>
          <p>RAG addressed LLM challenges, such as hallucination and outmoded knowledge, by retrieving external knowledge sources to complete tasks in addition to the LLM’s static dataset. This was done through a built-in retrieval component that feeds relevant documents along with the prompt to the LLM [<xref ref-type="bibr" rid="ref45">45</xref>]. The LLM was effectively fine-tuned to peer review without the need to retrain. RAG was the only prompting strategy capable of ingesting full papers; all other methods were restricted to abstracts due to token limitations.</p>
        </sec>
        <sec>
          <title>Temperature</title>
          <p>The temperature hyperparameter of an LLM regulates the amount of randomness in its response. On a scale of 0 to 1, a higher temperature results in more diverse or novel outputs, while a lower temperature results in more predictable or less creative outputs. A systematic grid search over 4 temperatures (0, 0.1, 0.5, and 1.0) was conducted with each prompting strategy. These values were selected to span the practical range from near-deterministic (0, 0.1) to moderately stochastic (0.5) to maximally stochastic (1.0), enabling observation of whether classification performance improved with more or less sampling diversity.</p>
        </sec>
      </sec>
      <sec>
        <title>LLM Comparison Metrics</title>
        <p>To evaluate the performance of the LLMs in peer review, accuracy scores were calculated. Because LLMs have been found to give passable decisions but rarely give completely correct decisions [<xref ref-type="bibr" rid="ref26">26</xref>], both “exact match” and “loose match” accuracies were calculated. Exact match accuracy is based on completely correct predictions of journal quartiles. For loose match accuracy, a prediction within 1 quartile of the input paper’s true quartile was considered correct. Finally, accuracy breakdowns per journal quartile were calculated to allow for further analysis of LLM behaviors. In addition, runtime and computing resource cost were compared to provide further insights into whether LLM performance was related to the amount of resources consumed.</p>
      </sec>
      <sec>
        <title>Fairness Evaluation</title>
        <p>Each of the 5 LLMs evaluated the 200 collected transplantation papers 3 times: with no affiliation, artificial high-tier affiliation, and artificial low-tier affiliation. The high-tier and low-tier affiliations were chosen based on the Webometrics University Ranking of the number of citations amassed by research institutions in the last 6 years, with the National Institutes of Health ranking first and the Walter Reed Army Institute ranking last. The website ceased to function after experimentation was completed, so it is not accessible, although a preprint describing it is available [<xref ref-type="bibr" rid="ref46">46</xref>]. They were used to determine prestigious and less prestigious affiliations, respectively. A chi-square test for independence was used to test for association between affiliation and quartile. To quantify the strength of any observed association, effect sizes were calculated using Cramer V, with 95% CIs derived via nonparametric bootstrapping (5000 resamples). Results were considered statistically significant at an α level of .05. Adjusted Pearson residuals were also calculated, with residuals with an absolute value greater than 1.96 considered statistically significant [<xref ref-type="bibr" rid="ref47">47</xref>]. Combined residuals for the top 2 and bottom 2 quartiles were also calculated for better interpretability, considering Q1 and Q2 as high-tier decisions and Q3 and Q4 as low-tier decisions.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This research did not require institutional review board approval because it did not meet the regulatory definition of human subjects research. The analysis was limited to publicly available literature and did not involve human participants, patient data, or identifiable personal information. According to 45 Code of Federal Regulations Part 46, such activities fall outside the scope of mandatory institutional review board approval.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Optimal Configuration</title>
        <p>As shown in <xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure2" ref-type="fig">Figure 2</xref>, the combination of RAG and a temperature of 0.5 yielded the highest exact match and loose match accuracies—0.35 and 0.775, respectively. RAG offered the most diversity in quartile predictions, while the other 3 strategies had 0% Q3 and Q4 accuracies. The zero-shot strategy resulted in the longest runtime and an overprediction of Q1. Few-shot prompting generally resulted in Q1 and Q2 decisions, though the results changed drastically with different temperatures, achieving the lowest exact match accuracy in <xref ref-type="table" rid="table1">Table 1</xref> (0.2)—performing worse than random guessing. ToT drastically decreased LLM runtime and resulted in 100% Q2 accuracy, suggesting that the use of intermediate reasoning steps biased the model toward the neutral Q2 decision. Overall, exact match accuracies were extremely low. The difference in accuracies between Q1-Q2 and Q3-Q4 suggests the LLMs predicted the top 2 quartiles more frequently than the bottom 2, which corroborates previous studies’ findings that LLMs tend to inflate acceptance results [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>].</p>
        <p>These results illustrate the enhanced proficiency of RAG in peer review compared with other prompting methods. RAG possesses several advantages: retrieval of relevant information to improve model accuracy, greater response diversity, and a large context window that allows the LLM to read full papers [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. RAG was the only prompting strategy capable of ingesting full papers, whereas all other methods were restricted to abstracts due to token limitations. Although this input-length asymmetry likely contributed to RAG’s superior performance, it also mirrors real-world deployment constraints, where many LLMs cannot natively process long scientific texts without retrieval augmentation. Thus, the comparison reflects each method’s practically usable form rather than an artificially equalized setting. The fundamental benefits RAG possesses over the other methods allow greater generalizability to the peer review process.</p>
        <p>Interestingly, the most suitable temperature was 0.5 rather than lower temperatures, which were initially considered more suitable for the objective, fact-based peer review process. The temperature setting of 0.5 may strike a favorable balance between the objectivity and creativity required for review tasks.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Comparative analysis of various large language model prompt engineering techniques under different temperature hyperparameters.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="120"/>
            <col width="130"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="0"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Runtime (seconds)</td>
                <td>Exact match accuracy (95% CI)</td>
                <td>Loose match accuracy</td>
                <td>Q1<sup>a</sup> accuracy</td>
                <td>Q2<sup>b</sup> accuracy</td>
                <td>Q3<sup>c</sup> accuracy</td>
                <td colspan="2">Q4<sup>d</sup> accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="10">
                  <bold>RAG<sup>e</sup></bold>
                  <bold>with zero-shot prompting</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=1.0</td>
                <td>1106.869</td>
                <td>0.3 (0.21-0.41)</td>
                <td>0.7</td>
                <td>0.4</td>
                <td>0.6</td>
                <td>0.2</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.5</td>
                <td>1089.631</td>
                <td><italic>0.35</italic><sup>f</sup> (0.25-0.46)</td>
                <td>
                  <italic>0.775</italic>
                  <sup>f</sup>
                </td>
                <td>0.5</td>
                <td>0.7</td>
                <td>0.2</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.1</td>
                <td>1109.433</td>
                <td>0.325 (0.23-0.43)</td>
                <td>0.3</td>
                <td>0.4</td>
                <td>0.7</td>
                <td>0.2</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0</td>
                <td>1107.765</td>
                <td>0.3 (0.21-0.41)</td>
                <td>0.275</td>
                <td>0.4</td>
                <td>0.6</td>
                <td>0.2</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Zero-shot prompting</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=1.0</td>
                <td>1821.130</td>
                <td>0.275 (0.19-0.38)</td>
                <td>0.65</td>
                <td>0.8</td>
                <td>0.3</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.5</td>
                <td>1814.739</td>
                <td>0.325 (0.23-0.41)</td>
                <td>0.675</td>
                <td>0.9</td>
                <td>0.4</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.1</td>
                <td>1824.464</td>
                <td>0.3 (0.21-0.41)</td>
                <td>0.625</td>
                <td>0.8</td>
                <td>0.4</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0</td>
                <td>1832.114</td>
                <td>0.3 (0.21-0.41)</td>
                <td>0.625</td>
                <td>0.8</td>
                <td>0.4</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Few-shot prompting</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=1.0</td>
                <td>1743.013</td>
                <td>0.25 (0.20-0.31)</td>
                <td>0.65</td>
                <td>0.6</td>
                <td>0.4</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.5</td>
                <td>1737.973</td>
                <td>0.325 (0.23-0.43)</td>
                <td>0.624</td>
                <td>0.9</td>
                <td>0.4</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.1</td>
                <td>1729.847</td>
                <td>0.2 (0.15-0.26)</td>
                <td>0.625</td>
                <td>0.4</td>
                <td>0.4</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0</td>
                <td>1755.463</td>
                <td>0.275 (0.19-0.38)</td>
                <td>0.6</td>
                <td>0.7</td>
                <td>0.4</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td colspan="10">
                  <bold>Tree of thoughts prompting</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=1.0</td>
                <td>615.322</td>
                <td>0.3 (0.21-0.41)</td>
                <td>0.75</td>
                <td>0.2</td>
                <td>1.0</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.5</td>
                <td>596.043</td>
                <td>0.275 (0.19-0.38)</td>
                <td>0.725</td>
                <td>0.1</td>
                <td>1.0</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0.1</td>
                <td>613.613</td>
                <td>0.25 (0.20-0.31)</td>
                <td>0.75</td>
                <td>0.0</td>
                <td>1.0</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Temperature=0</td>
                <td>615.121</td>
                <td>0.25 (0.20-0.31)</td>
                <td>0.75</td>
                <td>0.0</td>
                <td>1.0</td>
                <td>0.0</td>
                <td colspan="2">0.0</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>Q1: quartile 1.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Q2: quartile 2.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Q3: quartile 3.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>Q4: quartile 4.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>RAG: retrieval-augmented generation.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>Italics indicate the highest accuracy.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Loose match accuracies of combinations of temperature and prompt engineering techniques. RAG: retrieval-augmented generation; ToT: tree of thoughts.</p>
          </caption>
          <graphic xlink:href="ai_v5i1e84322_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Fairness</title>
        <p><xref ref-type="table" rid="table2">Table 2</xref> presents the quartile decisions across the 3 different input affiliations, along with the <italic>P</italic> values of the chi-square test for independence and the corresponding effect sizes (Cramer V) with 95% CIs. The <italic>P</italic> values of Gemma 2 and Qwen 2.5 are approximately an order of magnitude smaller than the other <italic>P</italic> values, demonstrating greater statistical significance. However, the effect sizes for all models were negligible (Cramer V≤0.10), with CIs indicating that any true association between affiliation and quartile decision was minimal.</p>
        <p>To break down the results of this test, adjusted Pearson residuals are presented in <xref ref-type="table" rid="table3">Table 3</xref>, where positive values indicate overrepresentation and negative values indicate underrepresentation relative to expected response frequencies. One unanticipated result is that Gemma significantly overpredicted the number of Q4 papers when given no affiliation. By contrast, when given no affiliation, Qwen significantly overpredicted the number of Q1 papers. Interestingly, when given a prestigious affiliation, Qwen placed more papers in Q4. These statistically significant associations did not align with affiliation bias, as that would entail overestimating Q1 and Q2 and underestimating Q3 and Q4 decisions when given prestigious affiliations.</p>
        <p>Overall, none of the LLMs exhibited affiliation bias.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Model quartile decisions across input affiliations, chi-square test for independence <italic>P</italic> values, effect sizes, and 95% CIs.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="210"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="0"/>
            <col width="80"/>
            <col width="0"/>
            <col width="0"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Models and affiliation level</td>
                <td colspan="2">Q1<sup>a</sup>, 50 (25%)</td>
                <td colspan="2">Q2<sup>b</sup>, 50 (25%)</td>
                <td colspan="2">Q3<sup>c</sup>, 50 (25%)</td>
                <td colspan="2">Q4<sup>d</sup>, 50 (25%)</td>
                <td colspan="3"><italic>P</italic> value</td>
                <td colspan="2">Cramer V (95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="12">
                  <bold>Llama 3.3-70B</bold>
                </td>
                <td colspan="3">.80</td>
                <td>0.05083 (0.047-0.127)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">37</td>
                <td colspan="2">148</td>
                <td colspan="2">15</td>
                <td colspan="2">0</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">48</td>
                <td colspan="2">139</td>
                <td colspan="2">12</td>
                <td colspan="2">1</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">41</td>
                <td colspan="2">144</td>
                <td colspan="2">14</td>
                <td colspan="2">1</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>Mistral-7B</bold>
                </td>
                <td colspan="3">.63</td>
                <td>0.04621 (0.027-0.120)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">54</td>
                <td colspan="2">101</td>
                <td colspan="2">45</td>
                <td colspan="2">0</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">55</td>
                <td colspan="2">112</td>
                <td colspan="2">33</td>
                <td colspan="2">0</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">54</td>
                <td colspan="2">109</td>
                <td colspan="2">37</td>
                <td colspan="2">0</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>Gemma 2-9B</bold>
                </td>
                <td colspan="3">
                  <italic>.08</italic>
                  <sup>e</sup>
                </td>
                <td>0.08408 (0.045-0.153)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">0</td>
                <td colspan="2">24</td>
                <td colspan="2">157</td>
                <td colspan="2">19</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">0</td>
                <td colspan="2">27</td>
                <td colspan="2">166</td>
                <td colspan="2">7</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">0</td>
                <td colspan="2">24</td>
                <td colspan="2">168</td>
                <td colspan="2">8</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>DeepSeek r1-distill Qwen-14B</bold>
                </td>
                <td colspan="3">.87</td>
                <td>0.04516 (0.041-0.123)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">3</td>
                <td colspan="2">73</td>
                <td colspan="2">113</td>
                <td colspan="2">11</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">1</td>
                <td colspan="2">83</td>
                <td colspan="2">103</td>
                <td colspan="2">13</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">3</td>
                <td colspan="2">79</td>
                <td colspan="2">106</td>
                <td colspan="2">12</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>Qwen 2.5-7B</bold>
                </td>
                <td colspan="3">
                  <italic>.05</italic>
                </td>
                <td>0.10159 (0.071-0.161)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">6</td>
                <td colspan="2">51</td>
                <td colspan="2">143</td>
                <td colspan="2">0</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">2</td>
                <td colspan="2">69</td>
                <td colspan="2">127</td>
                <td colspan="2">2</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">1</td>
                <td colspan="2">60</td>
                <td colspan="2">139</td>
                <td colspan="2">0</td>
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="3">
                  <break/>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Q1: quartile 1.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Q2: quartile 2.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>Q3: quartile 3.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>Q4: quartile 4.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>Italicization indicates relatively significant <italic>P</italic> values.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Adjusted Pearson residuals of large language model decisions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Models and affiliation level</td>
                <td colspan="2">Q1<sup>a</sup> (adjusted Pearson residuals)</td>
                <td colspan="2">Q2<sup>b</sup> (adjusted Pearson residuals)</td>
                <td colspan="2">Q3<sup>c</sup> (adjusted Pearson residuals)</td>
                <td colspan="2">Q4<sup>d</sup> (adjusted Pearson residuals)</td>
                <td colspan="2">Q1 and Q2 (adjusted Pearson residuals)</td>
                <td>Q3 and Q4 (adjusted Pearson residuals)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="14">
                  <bold>Llama 3.3-70B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">−1.063</td>
                <td colspan="2">0.8343</td>
                <td colspan="2">0.4576</td>
                <td colspan="2">−1.0017</td>
                <td colspan="2">−0.2288</td>
                <td colspan="2">−0.5440</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">11.2757</td>
                <td colspan="2">−0.8985</td>
                <td colspan="2">−0.5720</td>
                <td colspan="2">0.5008</td>
                <td colspan="2">0.3773</td>
                <td colspan="2">−0.0712</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">−0.2126</td>
                <td colspan="2">0.0642</td>
                <td colspan="2">0.1144</td>
                <td colspan="2">0.5008</td>
                <td colspan="2">−0.1484</td>
                <td colspan="2">0.6152</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Mistral-7B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">−0.0649</td>
                <td colspan="2">−1.0999</td>
                <td colspan="2">1.4668</td>
                <td colspan="2">0</td>
                <td colspan="2">−1.1648</td>
                <td colspan="2">1.4668</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">0.1298</td>
                <td colspan="2">0.8105</td>
                <td colspan="2">−1.1734</td>
                <td colspan="2">0</td>
                <td colspan="2">0.9403</td>
                <td colspan="2">−1.1734</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">−0.0649</td>
                <td colspan="2">0.2895</td>
                <td colspan="2">−0.2934</td>
                <td colspan="2">0</td>
                <td colspan="2">0.2246</td>
                <td colspan="2">−0.2934</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Gemma 2-9B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">0</td>
                <td colspan="2">−0.2619</td>
                <td colspan="2">−1.4974</td>
                <td colspan="2">
                  <italic>2.8717</italic>
                  <sup>e</sup>
                </td>
                <td colspan="2">−0.2619</td>
                <td colspan="2">1.3743</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">0</td>
                <td colspan="2">0.5237</td>
                <td colspan="2">0.5240</td>
                <td colspan="2">−1.6231</td>
                <td colspan="2">0.5237</td>
                <td colspan="2">−1.0991</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">0</td>
                <td colspan="2">−0.2619</td>
                <td colspan="2">0.9732</td>
                <td colspan="2">−1.2486</td>
                <td colspan="2">−0.2619</td>
                <td colspan="2">−0.2753</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>DeepSeek r1-distill Qwen-14B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">0.5377</td>
                <td colspan="2">−0.9462</td>
                <td colspan="2">0.9841</td>
                <td colspan="2">−0.3647</td>
                <td colspan="2">−0.4086</td>
                <td colspan="2">0.6195</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">−1.0753</td>
                <td colspan="2">0.8280</td>
                <td colspan="2">−0.7526</td>
                <td colspan="2">0.3647</td>
                <td colspan="2">−0.2474</td>
                <td colspan="2">−0.3879</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">0.5377</td>
                <td colspan="2">0.1183</td>
                <td colspan="2">−0.2316</td>
                <td colspan="2">0</td>
                <td colspan="2">0.6559</td>
                <td colspan="2">−0.2316</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Qwen 2.5-7B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>None</td>
                <td colspan="2">
                  <italic>2.1374</italic>
                </td>
                <td colspan="2">−1.7008</td>
                <td colspan="2">1.2394</td>
                <td colspan="2">−1.0017</td>
                <td colspan="2">0.4366</td>
                <td colspan="2">0.2377</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>High tier</td>
                <td colspan="2">−0.7125</td>
                <td colspan="2">1.7008</td>
                <td colspan="2">−1.7352</td>
                <td colspan="2">
                  <italic>2.0033</italic>
                </td>
                <td colspan="2">0.9884</td>
                <td colspan="2">0.2682</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Low tier</td>
                <td colspan="2">−1.4249</td>
                <td colspan="2">0</td>
                <td colspan="2">0.4958</td>
                <td colspan="2">−1.0017</td>
                <td colspan="2">−1.4249</td>
                <td colspan="2">−0.5059</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Q1: quartile 1.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Q2: quartile 2.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>Q3: quartile 3.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>Q4: quartile 4.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>Italicization indicates statistically significant residuals.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Comparisons</title>
        <p>In the second round of testing, Mistral 7B had the highest exact match accuracy, and Qwen 2.5 had the highest loose match accuracy among the LLMs (<xref ref-type="table" rid="table4">Table 4</xref>). Notably, each LLM had a unique “personality” or preference for the quartiles in which it placed papers. As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, Llama 3.3 preferred Q2 followed by Q1, Mistral preferred Q2 closely followed by Q1, Gemma preferred Q3 followed by Q2, DeepSeek preferred Q3 followed by Q2, and Qwen preferred Q3 followed by Q2. The LLMs exhibited low overall accuracy, which was expected, as they were not trained for technical topics [<xref ref-type="bibr" rid="ref17">17</xref>]. A previous study found that LLMs tend to be biased toward technical excellence over the novelty of submitted experiments [<xref ref-type="bibr" rid="ref51">51</xref>]. Interestingly, the LLMs tended to avoid placing papers in either extreme quartile (Q1 or Q4).</p>
        <p>Comparisons of LLM runtimes and sizes are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. Qwen 2.5 had the greatest average runtime by far (7248.741 seconds), though it also had the greatest loose match accuracy. Llama 3.3 had the second greatest average runtime (1325.805 seconds) and the largest size by a significant margin (70B). Mistral had the lowest average runtime (1246.378 seconds) and the smallest size (7B), yet it impressively achieved the highest exact match accuracy.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Runtime and accuracy of model predictions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="70"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="120"/>
            <col width="0"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Run</td>
                <td colspan="2">Runtime (seconds)</td>
                <td colspan="2">Exact match accuracy (95% CI)</td>
                <td colspan="2">Loose match accuracy</td>
                <td colspan="2">Q1<sup>a</sup> accuracy</td>
                <td colspan="2">Q2<sup>b</sup> accuracy</td>
                <td colspan="2">Q3<sup>c</sup> accuracy</td>
                <td>Q4<sup>d</sup> accuracy</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="16">
                  <bold>Llama 3.3-70B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">1331.919</td>
                <td colspan="2">0.275 (0.227-0.297)</td>
                <td colspan="2">0.755</td>
                <td colspan="2">0.16</td>
                <td colspan="2">0.86</td>
                <td colspan="2">0.08</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">1318.557</td>
                <td colspan="2">0.25 (0.227-0.297)</td>
                <td colspan="2">0.77</td>
                <td colspan="2">0.22</td>
                <td colspan="2">0.76</td>
                <td colspan="2">0.02</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">1326.938</td>
                <td colspan="2">0.255 (0.227-0.297)</td>
                <td colspan="2">0.765</td>
                <td colspan="2">0.26</td>
                <td colspan="2">0.74</td>
                <td colspan="2">0.02</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td colspan="16">
                  <bold>Mistral-7B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">1245.114</td>
                <td colspan="2">0.295 (0.284-0.358)</td>
                <td colspan="2">0.785</td>
                <td colspan="2">0.46</td>
                <td colspan="2">0.52</td>
                <td colspan="2">0.2</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">1249.270</td>
                <td colspan="2">
                  <italic>0.35 (0.284-0.358)</italic>
                  <sup>e</sup>
                </td>
                <td colspan="2">0.775</td>
                <td colspan="2">0.52</td>
                <td colspan="2">0.64</td>
                <td colspan="2">0.24</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">1244.751</td>
                <td colspan="2">0.315 (0.284-0.358)</td>
                <td colspan="2">0.76</td>
                <td colspan="2">0.38</td>
                <td colspan="2">0.64</td>
                <td colspan="2">0.24</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td colspan="16">
                  <bold>Gemma 2-9B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">1250.046</td>
                <td colspan="2">0.255 (0.257-0.329)</td>
                <td colspan="2">0.77</td>
                <td colspan="2">0.06</td>
                <td colspan="2">0.26</td>
                <td colspan="2">0.7</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">1439.940</td>
                <td colspan="2">0.315 (0.257-0.329)</td>
                <td colspan="2">0.82</td>
                <td colspan="2">0</td>
                <td colspan="2">0.46</td>
                <td colspan="2">0.76</td>
                <td colspan="2">0.04</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">1229.739</td>
                <td colspan="2">0.305 (0.257-0.329)</td>
                <td colspan="2">0.795</td>
                <td colspan="2">0</td>
                <td colspan="2">0.42</td>
                <td colspan="2">0.8</td>
                <td colspan="2">0</td>
              </tr>
              <tr valign="top">
                <td colspan="16">
                  <bold>DeepSeek r1-distill Qwen-14B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">1317.195</td>
                <td colspan="2">0.28 (0.252-0.324)</td>
                <td colspan="2">0.81</td>
                <td colspan="2">0</td>
                <td colspan="2">0.06</td>
                <td colspan="2">0.82</td>
                <td colspan="2">0.24</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">1309.082</td>
                <td colspan="2">0.27 (0.252-0.324)</td>
                <td colspan="2">0.815</td>
                <td colspan="2">0</td>
                <td colspan="2">0.14</td>
                <td colspan="2">0.9</td>
                <td colspan="2">0.04</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">1257.635</td>
                <td colspan="2">0.31 (0.252-0.324)</td>
                <td colspan="2">0.8</td>
                <td colspan="2">0</td>
                <td colspan="2">0.22</td>
                <td colspan="2">0.94</td>
                <td colspan="2">0.08</td>
              </tr>
              <tr valign="top">
                <td colspan="16">
                  <bold>Qwen 2.5-7B</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 1</td>
                <td colspan="2">7211.855</td>
                <td colspan="2">0.296 (0.270-0.343)</td>
                <td colspan="2">0.835</td>
                <td colspan="2">0</td>
                <td colspan="2">0.48</td>
                <td colspan="2">0.68</td>
                <td colspan="2">0.02</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 2</td>
                <td colspan="2">7302.680</td>
                <td colspan="2">0.315 (0.270-0.343)</td>
                <td colspan="2">
                  <italic>0.84</italic>
                </td>
                <td colspan="2">0</td>
                <td colspan="2">0.44</td>
                <td colspan="2">0.74</td>
                <td colspan="2">0.08</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Run 3</td>
                <td colspan="2">7231.687</td>
                <td colspan="2">0.305 (0.270-0.343)</td>
                <td colspan="2">0.825</td>
                <td colspan="2">0</td>
                <td colspan="2">0.56</td>
                <td colspan="2">0.64</td>
                <td colspan="2">0.02</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Q1: quartile 1.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>Q2: quartile 2.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>Q3: quartile 3.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>Q4: quartile 4.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>Italicization indicates the highest accuracy.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Distribution of large language model decisions across the provided affiliations. Q1: quartile 1; Q2: quartile 2; Q3: quartile 3; Q4: quartile 4.</p>
          </caption>
          <graphic xlink:href="ai_v5i1e84322_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our findings can inform future efforts to use LLMs in peer review. The data indicated that while affiliation bias was not present in current LLMs, their prediction accuracy was insufficient to replace human reviewers at this time. Notably, the LLMs studied appeared to have distinct “personalities” that preferred to allocate papers to journals of a specific quartile. Across the board, the LLMs avoided allocating to the extremes of Q1 and Q4, instead preferring predictions in Q2 and Q3, corroborating previous studies’ findings that LLMs struggle to provide critical feedback comparable to human reviewers [<xref ref-type="bibr" rid="ref11">11</xref>].</p>
        <p>Our experimental design underscores 2 methods with outsized impact: knowledge retrieval and sampling diversity. RAG shattered the context-window ceiling that limited earlier studies to abstract-level inputs, enabling full-text appraisal that mirrors human practice.</p>
        <p>Another surprising result arose from temperature tuning. We hypothesized that near-deterministic sampling (eg, temperature ≤0.2) would best serve an “objective task”; instead, a moderate temperature of 0.5 struck the sweet spot between rigidity and adaptability. This finding aligns with emerging cognitive science analogies that liken temperature to divergent thinking in human creativity: too low and the model becomes dogmatic, too high and it hallucinates. A logical extension is adaptive temperature schedules, where the model introspects and modulates creativity as necessary—high creativity for speculative synthesis and low for citation verification.</p>
        <p>A key result is the near eradication of affiliation bias. In historical datasets, manuscripts bearing a globally recognizable university crest enjoy acceptance advantages of roughly 1 in 8 decisions [<xref ref-type="bibr" rid="ref6">6</xref>]; in our LLM trials, models consistently predicted in Q2 and Q3. Llama 3.3 and Mistral often chose to place papers in Q2, while Gemma 2, DeepSeek, and Qwen 2.5 more often chose Q3. At first glance, this is cause for celebration: a plausible pathway toward a peer review ecosystem that rewards intellectual merit rather than institutional pedigree. However, an equally important, if less comfortable, lesson is that “LLM objectivity” is conditional and brittle. Gemma 2 and Qwen 2.5 inched uncomfortably close to significance (<italic>P</italic>=.08 and <italic>P</italic>=.054, respectively), raising the possibility that model bias may reemerge as training corpora, instruction-tuning objectives, or deployment prompts shift over time.</p>
        <p>Raw performance metrics make a compelling case for restraint. Exact match accuracy peaked at 35%, a level that would be untenable as the sole basis for publication decisions. Loose match accuracy reached 84%, but this metric is largely a reflection of centrist allocation: with most papers placed in Q2 or Q3, a large proportion fall within one quartile by default rather than due to genuine assessment of scientific merit. These results strongly suggest that current LLMs cannot replace human reviewers. LLMs should be restricted to acting as high-recall assistants that flag methodological red flags and assemble structured digests, while reserving nuanced judgment and field-specific contextualization for human experts.</p>
        <p>Beyond headline accuracy, our study uncovered a subtler, systemic risk: each LLM exhibits a stable preference profile—a “personality” in editorial decisions. Llama 3.3 habitually gravitates toward Q2, while Mistral tends to give generous Q1 and Q2 ratings. By contrast, Qwen 2.5, DeepSeek, and Gemma 2 have the counterintuitive habit of demoting elite-affiliated papers to Q3—a behavior that human reviewers rarely exhibit. These anomalies likely originate deep in the pretraining soup of web pages, blogs, and archival documents where prestige cues intermix with polemics, conspiracies, and outdated citation networks. What appears as “objectivity” may instead be an averaging of contradictory signals rather than a principled neutrality. Crucially, these inherent biases could distort acceptance profiles if models are used in practice. Journals unaware of these quirks risk subtly penalizing high-quality work or promoting safe-but-unremarkable manuscripts. This serves as a warning: without careful monitoring, audits, and ongoing validation, reliance on LLMs for peer review decisions could unintentionally introduce new forms of systematic error rather than reduce bias.</p>
      </sec>
      <sec>
        <title>Limitations and Future Research</title>
        <p>A central limitation is the use of journal quartiles as ground truth labels. Individual articles vary widely within the same journal, editorial decisions reflect a mixture of scientific and contextual factors, and journal prestige can correlate with stylistic conventions (writing density, terminology, and methodological templates) [<xref ref-type="bibr" rid="ref52">52</xref>]. Quartiles offer a practical and standardized metric, but they imperfectly represent article-level quality and may lead LLMs to predict based on venue-specific stylistic cues rather than substantive scientific merit. Future studies should incorporate richer outcome labels, including a 5-option decision set (Q1, Q2, Q3, Q4, or not publishable), reviewer scores, or editorial decisions.</p>
        <p>Additionally, because only published manuscripts were included, the dataset lacks the full spectrum of real-world submissions. Incorporating preprints would more closely approximate authentic peer review conditions.</p>
        <p>Finally, while powerful, LLMs are known to be somewhat unstable: even when provided with identical prompts, an LLM’s outputs may be inconsistent in factual content (hallucinations) or in misinformation provided [<xref ref-type="bibr" rid="ref53">53</xref>]. Future research on this topic may benefit from incorporating multiple trials for each LLM and prompt strategy to account for this variability to determine whether the patterns observed here hold across topics, disciplines, and manuscript types.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study (1) investigated the presence of affiliation bias in LLM peer review, (2) evaluated the proficiency of popular open-source LLMs in the prediction of journal quartiles for transplantation papers, and (3) determined the effect of different prompting methods and temperatures on LLM peer review. While the LLMs were found to be free of affiliation bias, they struggled to provide exact, correct answers. This highlights the limited capacity of current LLMs for autonomous peer review and the nonnegotiable need for human supervision if used. Finally, Mistral had the highest accuracy and efficiency among all the models, and RAG combined with a temperature of 0.5 was the best-performing combination of prompting methods, although by a small margin.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Example data source.</p>
        <media xlink:href="ai_v5i1e84322_app1.docx" xlink:title="DOCX File , 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Experimental design and prompt materials.</p>
        <media xlink:href="ai_v5i1e84322_app2.docx" xlink:title="DOCX File , 152 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Comparison of large language model runtimes and sizes.</p>
        <media xlink:href="ai_v5i1e84322_app3.docx" xlink:title="DOCX File , 149 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">RAG</term>
          <def>
            <p>retrieval-augmented generation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ToT</term>
          <def>
            <p>tree of thoughts</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">Q1</term>
          <def>
            <p>quartile 1</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">Q2</term>
          <def>
            <p>quartile 2</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">Q3</term>
          <def>
            <p>quartile 3</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">Q4</term>
          <def>
            <p>quartile 4</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <notes>
      <sec>
        <title>Funding</title>
        <p>This study was partially funded by the National Science Foundation (NSF–IIS/ENG: SCH:/2123683).</p>
      </sec>
    </notes>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tennant</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Ross-Hellauer</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>The limitations to our understanding of peer review</article-title>
          <source>Res Integr Peer Rev</source>
          <year>2020</year>
          <volume>5</volume>
          <fpage>6</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://researchintegrityjournal.biomedcentral.com/articles/10.1186/s41073-020-00092-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s41073-020-00092-1</pub-id>
          <pub-id pub-id-type="medline">32368354</pub-id>
          <pub-id pub-id-type="pii">92</pub-id>
          <pub-id pub-id-type="pmcid">PMC7191707</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanson</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Barreiro</surname>
              <given-names>PG</given-names>
            </name>
            <name name-style="western">
              <surname>Crosetto</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Brockington</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The strain on scientific publishing</article-title>
          <source>Quant Sci Stud</source>
          <year>2024</year>
          <volume>5</volume>
          <issue>4</issue>
          <fpage>823</fpage>
          <lpage>43</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://direct.mit.edu/qss/article/5/4/823/124269/The-strain-on-scientific-publishing"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/qss_a_00327</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dance</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Stop the peer-review treadmill. I want to get off</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>02</month>
          <volume>614</volume>
          <issue>7948</issue>
          <fpage>581</fpage>
          <lpage>3</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/d41586-023-00403-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00403-8</pub-id>
          <pub-id pub-id-type="medline">36781962</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00403-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ellaway</surname>
              <given-names>RH</given-names>
            </name>
          </person-group>
          <article-title>Where have all the reviewers gone?</article-title>
          <source>Adv Health Sci Educ Theory Pract</source>
          <year>2024</year>
          <month>07</month>
          <volume>29</volume>
          <issue>3</issue>
          <fpage>717</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1007/s10459-024-10350-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10459-024-10350-2</pub-id>
          <pub-id pub-id-type="medline">38864958</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10459-024-10350-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chauhan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Currie</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>The impact of generative artificial intelligence on the external review of scientific manuscripts and editorial peer review processes</article-title>
          <source>Am J Pathol</source>
          <year>2024</year>
          <month>10</month>
          <volume>194</volume>
          <issue>10</issue>
          <fpage>1802</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0002-9440(24)00286-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ajpath.2024.08.002</pub-id>
          <pub-id pub-id-type="medline">39128578</pub-id>
          <pub-id pub-id-type="pii">S0002-9440(24)00286-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Gross</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Desai</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Grant</surname>
              <given-names>AO</given-names>
            </name>
            <name name-style="western">
              <surname>Daniels</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Hachinski</surname>
              <given-names>VC</given-names>
            </name>
            <name name-style="western">
              <surname>Gibbons</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Krumholz</surname>
              <given-names>HM</given-names>
            </name>
          </person-group>
          <article-title>Effect of blinded peer review on abstract acceptance</article-title>
          <source>JAMA</source>
          <year>2006</year>
          <month>04</month>
          <day>12</day>
          <volume>295</volume>
          <issue>14</issue>
          <fpage>1675</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1001/jama.295.14.1675"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.295.14.1675</pub-id>
          <pub-id pub-id-type="medline">16609089</pub-id>
          <pub-id pub-id-type="pii">295/14/1675</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>AgentReview: exploring peer review dynamics with LLM agents</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on June 18, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2406.12708"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2406.12708</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Floridi</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>The new editorial gatekeepers: understanding LLM-based interfaces, their benefits, risks and design</article-title>
          <source>SSRN</source>
          <year>2025</year>
          <access-date>2026-01-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=5249831">https://papers.ssrn.com/sol3/papers.cfm?abstract_id=5249831</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Madusu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lal</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Is your paper being reviewed by an LLM? Benchmarking AI text detection in peer review</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 26, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2502.19614"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thakkar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Yuksekgonul</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Silberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Garg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Sha</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Vondrick</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Can LLM feedback enhance review quality? A randomized study of 20K reviews at ICLR 2025</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on April 13, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2504.09737"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2504.09737</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Perlis</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Fihn</surname>
              <given-names>SD</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the application of large language models in clinical research contexts</article-title>
          <source>JAMA Netw Open</source>
          <year>2023</year>
          <month>10</month>
          <day>02</day>
          <volume>6</volume>
          <issue>10</issue>
          <fpage>e2335924</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jamanetwork.com/journals/jamanetworkopen/fullarticle/10.1001/jamanetworkopen.2023.35924"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.35924</pub-id>
          <pub-id pub-id-type="medline">37782501</pub-id>
          <pub-id pub-id-type="pii">2809977</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oeding</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Mazzucco</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Dines</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Warren</surname>
              <given-names>RF</given-names>
            </name>
            <name name-style="western">
              <surname>Gulotta</surname>
              <given-names>LV</given-names>
            </name>
            <name name-style="western">
              <surname>Dines</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Kunze</surname>
              <given-names>KN</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT-4 performs clinical information retrieval tasks using consistently more trustworthy resources than does Google Search for queries concerning the Latarjet procedure</article-title>
          <source>Arthroscopy</source>
          <year>2025</year>
          <month>03</month>
          <volume>41</volume>
          <issue>3</issue>
          <fpage>588</fpage>
          <lpage>97</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.arthro.2024.05.025"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.arthro.2024.05.025</pub-id>
          <pub-id pub-id-type="medline">38936557</pub-id>
          <pub-id pub-id-type="pii">S0749-8063(24)00407-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ullah</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Parwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Baig</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Challenges and barriers of using large language models (LLM) such as ChatGPT for diagnostic medicine with a focus on digital pathology - a recent scoping review</article-title>
          <source>Diagn Pathol</source>
          <year>2024</year>
          <month>02</month>
          <day>27</day>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>43</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://diagnosticpathology.biomedcentral.com/articles/10.1186/s13000-024-01464-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13000-024-01464-7</pub-id>
          <pub-id pub-id-type="medline">38414074</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13000-024-01464-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10898121</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannakopoulos</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kavadella</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Aaqel Salim</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stamatopoulos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kaklamanos</surname>
              <given-names>EG</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of the performance of generative AI large language models ChatGPT, Google Bard, and Microsoft Bing Chat in supporting evidence-based dentistry: comparative mixed methods study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>12</month>
          <day>28</day>
          <volume>25</volume>
          <fpage>e51580</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e51580/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/51580</pub-id>
          <pub-id pub-id-type="medline">38009003</pub-id>
          <pub-id pub-id-type="pii">v25i1e51580</pub-id>
          <pub-id pub-id-type="pmcid">PMC10784979</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rane</surname>
              <given-names>NL</given-names>
            </name>
            <name name-style="western">
              <surname>Tawde</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Choudhary</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Rane</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Contribution and performance of ChatGPT and other large language models (LLM) for scientific and research advancements: a double-edged sword</article-title>
          <source>Int Res J Mod Eng Technol Sci</source>
          <year>2023</year>
          <month>10</month>
          <volume>5</volume>
          <issue>10</issue>
          <fpage>875</fpage>
          <lpage>99</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/374616494_Contribution_and_performance_of_ChatGPT_and_other_Large_Language_Models_LLM_for_scientific_and_research_advancements_a_double-edged_sword"/>
          </comment>
          <pub-id pub-id-type="doi">10.56726/IRJMETS45312</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jian</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Leng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Towards scientific intelligence: a survey of LLM-based scientific agents</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on March 31, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2503.24047"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>LLM agents as AI scientists: a survey</article-title>
          <source>Proceedings of the UIUC Spring 2025 CS598 LLM Agent Workshop</source>
          <year>2025</year>
          <conf-name>UIUC Spring 2025</conf-name>
          <conf-date>April 16-19, 2025</conf-date>
          <conf-loc>Urbana, IL</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/forum?id=bfdUWy6rUA"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Si</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hashimoto</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Can LLMs generate novel research ideas? A large-scale human study with 100+ NLP researchers</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on September 6, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2409.04109"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hossain</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sinha</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Bansal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Knipper</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sarkar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Salvador</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mahajan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Guttikonda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Akter</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mahadi Hassan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Freestone</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>MC Jr</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Karmaker</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>LLMs as meta-reviewers' assistants: a case study</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 23, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2402.15589"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2025.naacl-long.395</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Joos</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Keim</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Fischer</surname>
              <given-names>MT</given-names>
            </name>
          </person-group>
          <article-title>Cutting through the clutter: the potential of LLMs for efficient filtration in systematic literature reviews</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on July 15, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2407.10652"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Scherbakov</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hubig</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Jansari</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Bakumenko</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lenert</surname>
              <given-names>LA</given-names>
            </name>
          </person-group>
          <article-title>The emergence of large language models as tools in literature reviews: a large language model-assisted systematic review</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2025</year>
          <month>06</month>
          <day>01</day>
          <volume>32</volume>
          <issue>6</issue>
          <fpage>1071</fpage>
          <lpage>86</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1093/jamia/ocaf063"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaf063</pub-id>
          <pub-id pub-id-type="medline">40332983</pub-id>
          <pub-id pub-id-type="pii">8126534</pub-id>
          <pub-id pub-id-type="pmcid">PMC12089777</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Antoniak</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cheong</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>EY</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Lo</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>AX</given-names>
            </name>
          </person-group>
          <article-title>LLMs as research tools: a large scale survey of researchers' usage and perceptions</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on October 22, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2411.05025"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lepp</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Potts</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>JY</given-names>
            </name>
          </person-group>
          <article-title>Mapping the increasing use of LLMs in scientific papers</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on April 1, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2404.01268"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>VT</given-names>
            </name>
            <name name-style="western">
              <surname>Gartlehner</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yaacoub</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Boutron</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Schwingshackl</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stadelmaier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sommer</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Aboulayeh</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Afach</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Meerpohl</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ravaud</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Sensitivity, specificity and avoidable workload of using a large language models for title and abstract screening in systematic reviews and meta-analyses</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online on December 17, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2023.12.15.23300018v1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2023.12.15.23300018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>von Wedel</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schmitt</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Thiele</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leuner</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shay</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Redaelli</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schaefer</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <article-title>Affiliation bias in peer review of abstracts by a large language model</article-title>
          <source>JAMA</source>
          <year>2024</year>
          <month>01</month>
          <day>16</day>
          <volume>331</volume>
          <issue>3</issue>
          <fpage>252</fpage>
          <lpage>3</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38150261"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2023.24641</pub-id>
          <pub-id pub-id-type="medline">38150261</pub-id>
          <pub-id pub-id-type="pii">2813511</pub-id>
          <pub-id pub-id-type="pmcid">PMC10753437</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Is LLM a reliable reviewer? A comprehensive evaluation of LLM on automatic paper reviewing tasks</article-title>
          <source>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation</source>
          <year>2024</year>
          <conf-name>LREC-COLING 2024</conf-name>
          <conf-date>May 20-25, 2024</conf-date>
          <conf-loc>Torino, Italia</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2024.lrec-main.816/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannakopoulos</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kavadella</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Aaqel Salim</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stamatopoulos</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kaklamanos</surname>
              <given-names>EG</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of the performance of generative AI large language models ChatGPT, Google Bard, and Microsoft Bing Chat in supporting evidence-based dentistry: comparative mixed methods study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>12</month>
          <day>28</day>
          <volume>25</volume>
          <fpage>e51580</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e51580/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/51580</pub-id>
          <pub-id pub-id-type="medline">38009003</pub-id>
          <pub-id pub-id-type="pii">v25i1e51580</pub-id>
          <pub-id pub-id-type="pmcid">PMC10784979</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shopovski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mohdali</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Marolov</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Revolutionizing peer review: a comparative analysis of ChatGPT and human review reports in scientific publishing</article-title>
          <source>Preprints</source>
          <comment>Preprint posted online on February 3, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.preprints.org/manuscript/202502.0058"/>
          </comment>
          <pub-id pub-id-type="doi">10.20944/preprints202502.0058.v1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stokel-Walker</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Van Noorden</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>What ChatGPT and generative AI mean for science</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>02</month>
          <volume>614</volume>
          <issue>7947</issue>
          <fpage>214</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/d41586-023-00340-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00340-6</pub-id>
          <pub-id pub-id-type="medline">36747115</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00340-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gallegos</surname>
              <given-names>IO</given-names>
            </name>
            <name name-style="western">
              <surname>Rossi</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Barrow</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tanjim</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dernoncourt</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>NK</given-names>
            </name>
          </person-group>
          <article-title>Bias and fairness in large language models: a survey</article-title>
          <source>Comput Linguist</source>
          <year>2024</year>
          <month>09</month>
          <volume>50</volume>
          <issue>3</issue>
          <fpage>1097</fpage>
          <lpage>179</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2024.cl-3.8/"/>
          </comment>
          <pub-id pub-id-type="doi">10.1162/coli_a_00524</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pu</surname>
              <given-names>QH</given-names>
            </name>
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>QJ</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>KH</given-names>
            </name>
          </person-group>
          <article-title>Bibliometric analysis of the top-cited articles on islet transplantation</article-title>
          <source>Medicine (Baltimore)</source>
          <year>2017</year>
          <month>11</month>
          <volume>96</volume>
          <issue>44</issue>
          <fpage>e8247</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29095254"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MD.0000000000008247</pub-id>
          <pub-id pub-id-type="medline">29095254</pub-id>
          <pub-id pub-id-type="pii">00005792-201711030-00005</pub-id>
          <pub-id pub-id-type="pmcid">PMC5682773</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conroy</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>How ChatGPT and other AI tools could disrupt scientific publishing</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>10</month>
          <volume>622</volume>
          <issue>7982</issue>
          <fpage>234</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-03144-w</pub-id>
          <pub-id pub-id-type="medline">37817033</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-03144-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Izzo</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lepp</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>McFarland</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>JY</given-names>
            </name>
          </person-group>
          <article-title>Monitoring AI-modified content at scale: a case study on the impact of ChatGPT on AI conference peer reviews</article-title>
          <source>Proceedings of the 41st International Conference on Machine Learning</source>
          <year>2024</year>
          <conf-name>ICML'24</conf-name>
          <conf-date>July 21-27, 2024</conf-date>
          <conf-loc>Vienna, Austria</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.5555/3692070.3693262"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Vodrahalli</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>McFarland</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Can large language models provide useful feedback on research papers? A large-scale empirical analysis</article-title>
          <source>ArXiv</source>
          <year>2023</year>
          <month>10</month>
          <day>03</day>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2310.01783"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2310.01783</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Verharen</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT identifies gender disparities in scientific peer review</article-title>
          <source>Elife</source>
          <year>2023</year>
          <month>11</month>
          <day>03</day>
          <volume>12</volume>
          <fpage>RP90230</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37922198"/>
          </comment>
          <pub-id pub-id-type="doi">10.7554/eLife.90230</pub-id>
          <pub-id pub-id-type="medline">37922198</pub-id>
          <pub-id pub-id-type="pii">90230</pub-id>
          <pub-id pub-id-type="pmcid">PMC10624422</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bansal</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Prompt engineering importance and applicability with generative AI</article-title>
          <source>J Comput Commun</source>
          <year>2024</year>
          <volume>12</volume>
          <issue>10</issue>
          <fpage>14</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.4236/jcc.2024.1210002"/>
          </comment>
          <pub-id pub-id-type="doi">10.4236/jcc.2024.1210002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singh Chawla</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Is ChatGPT corrupting peer review? Telltale words hint at AI use</article-title>
          <source>Nature</source>
          <year>2024</year>
          <month>04</month>
          <volume>628</volume>
          <issue>8008</issue>
          <fpage>483</fpage>
          <lpage>4</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/d41586-024-01051-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/d41586-024-01051-2</pub-id>
          <pub-id pub-id-type="medline">38600197</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-024-01051-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garfunkel</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Ulshen</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Hamrick</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lawson</surname>
              <given-names>EE</given-names>
            </name>
          </person-group>
          <article-title>Effect of institutional prestige on reviewers' recommendations and editorial decisions</article-title>
          <source>JAMA</source>
          <year>1994</year>
          <month>07</month>
          <day>13</day>
          <volume>272</volume>
          <issue>2</issue>
          <fpage>137</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="medline">8015125</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lutkevich</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>30 of the best large language models in 2026</article-title>
          <source>Informa TechTarget</source>
          <access-date>2026-02-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.techtarget.com/whatis/feature/12-of-the-best-large-language-models">https://www.techtarget.com/whatis/feature/12-of-the-best-large-language-models</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="web">
          <source>SCImago Journal &#38; Country Rank</source>
          <access-date>2026-01-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.scimagojr.com/">https://www.scimagojr.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Dishant</collab>
          </person-group>
          <source>Editage</source>
          <year>2024</year>
          <access-date>2026-01-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.editage.us/blog/guide-to-journal-rankings-what-are-quartiles-q1-q2-q3-q4-journal/">https://www.editage.us/blog/guide-to-journal-rankings-what-are-quartiles-q1-q2-q3-q4-journal/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jegou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Douze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Faiss: a library for efficient similarity search</article-title>
          <source>Engineering at Meta</source>
          <year>2017</year>
          <month>03</month>
          <day>29</day>
          <access-date>2026-01-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/">https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thelwall</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kousha</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Makita</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Abdoli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stuart</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Levitt</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>In which fields do higher impact journals publish higher quality articles?</article-title>
          <source>Scientometrics</source>
          <year>2023</year>
          <month>05</month>
          <day>18</day>
          <volume>128</volume>
          <issue>7</issue>
          <fpage>3915</fpage>
          <lpage>33</lpage>
          <pub-id pub-id-type="doi">10.1007/s11192-023-04735-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abramo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>D’Angelo</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Di Costa</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Correlating article citedness and journal impact: an empirical investigation by field on a large-scale dataset</article-title>
          <source>Scientometrics</source>
          <year>2023</year>
          <month>01</month>
          <day>09</day>
          <volume>128</volume>
          <fpage>1877</fpage>
          <lpage>94</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1007/s11192-022-04622-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11192-022-04622-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <article-title>Prompt engineering guide</article-title>
          <source>Prompt Engineering</source>
          <access-date>2026-01-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.promptingguide.ai/">https://www.promptingguide.ai/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moskovkin</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>New methodology for calculating webometrics university ranking: from Google Scholar to OpenAlex</article-title>
          <source>Preprints.org</source>
          <comment>Preprint posted online on August 18, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.20944/preprints202508.1191.v1"/>
          </comment>
          <pub-id pub-id-type="doi">10.20944/preprints202508.1191.v1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Naioti</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mudrak</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Using adjusted standardized residuals for interpreting contingency tables</article-title>
          <source>Cornell Statistical Consulting Unit</source>
          <access-date>2026-01-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cscu.cornell.edu/wp-content/uploads/conttableresid.pdf">https://cscu.cornell.edu/wp-content/uploads/conttableresid.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Latona</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>Horta Ribeiro</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Davidson</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Veselovsky</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>West</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The AI review lottery: widespread AI-assisted peer reviews boost paper scores and acceptance rates</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on May 3, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2405.02150"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3757667</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Are we there yet? Revealing the risks of utilizing large language models in scholarly peer review</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on December 2, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2412.01708"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Inferenz</collab>
          </person-group>
          <article-title>Why RAG still matters: beyond token limits in LLMs</article-title>
          <source>Medium</source>
          <year>2024</year>
          <access-date>2026-01-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medium.com/@InferenzTech/why-rag-still-matters-beyond-token-limits-in-llms-289d16a930af">https://medium.com/@InferenzTech/why-rag-still-matters-beyond-token-limits-in-llms-289d16a930af</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Mind the blind spots: a focus-level evaluation framework for LLM reviews</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 24, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2502.17086"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2025.emnlp-main.1805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seglen</surname>
              <given-names>PO</given-names>
            </name>
          </person-group>
          <article-title>Why the impact factor of journals should not be used for evaluating research</article-title>
          <source>BMJ</source>
          <year>1997</year>
          <month>02</month>
          <day>15</day>
          <volume>314</volume>
          <issue>7079</issue>
          <fpage>498</fpage>
          <lpage>502</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/9056804"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.314.7079.497</pub-id>
          <pub-id pub-id-type="medline">9056804</pub-id>
          <pub-id pub-id-type="pmcid">PMC2126010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patwardhan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vaidya</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kundu</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Automated consistency analysis of LLMs</article-title>
          <source>ArXiv</source>
          <comment>Preprint posted online on February 10, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2502.07036"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
