<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e68097</article-id><article-id pub-id-type="doi">10.2196/68097</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Critical Assessment of Large Language Models&#x2019; (ChatGPT) Performance in Data Extraction for Systematic Reviews: Exploratory Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Mahmoudi</surname><given-names>Hesam</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chang</surname><given-names>Doris</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Hannah</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ghaffarzadegan</surname><given-names>Navid</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Jalali</surname><given-names>Mohammad S</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>MGH Institute for Technology Assessment, Harvard Medical School</institution><addr-line>125 Nashua St</addr-line><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Industrial and System Engineering Department, Virginia Tech</institution><addr-line>Falls Church</addr-line><addr-line>VA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Dankar</surname><given-names>Fida</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Thies</surname><given-names>Bill</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Hang</surname><given-names>Ching Nam</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mohammad S Jalali, PhD, MGH Institute for Technology Assessment, Harvard Medical School, 125 Nashua St, Boston, MA, 02114, United States, 1 6177243738; <email>msjalali@mgh.harvard.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>11</day><month>9</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e68097</elocation-id><history><date date-type="received"><day>28</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>27</day><month>03</month><year>2025</year></date><date date-type="accepted"><day>14</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Hesam Mahmoudi, Doris Chang, Hannah Lee, Navid Ghaffarzadegan, Mohammad S Jalali. 
Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 11.9.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e68097"/><abstract><sec><title>Background</title><p>Systematic literature reviews (SLRs) are foundational for synthesizing evidence across diverse fields and are especially important in guiding research and practice in health and biomedical sciences. However, they are labor intensive due to manual data extraction from multiple studies. 
As large language models (LLMs) gain attention for their potential to automate research tasks and extract basic information, understanding their ability to accurately extract explicit data from academic papers is critical for advancing SLRs.</p></sec><sec><title>Objective</title><p>Our study aimed to explore the capability of LLMs to extract both explicitly outlined study characteristics and deeper, more contextual information requiring nuanced evaluations, using ChatGPT (GPT-4).</p></sec><sec sec-type="methods"><title>Methods</title><p>We screened the full text of a sample of COVID-19 modeling studies and analyzed three basic measures of study settings (ie, analysis location, modeling approach, and analyzed interventions) and three complex measures of behavioral components in models (ie, mobility, risk perception, and compliance). To extract data on these measures, two researchers independently extracted 60 data elements using manual coding and compared them with the responses from ChatGPT to 420 queries spanning 7 iterations.</p></sec><sec sec-type="results"><title>Results</title><p>ChatGPT&#x2019;s accuracy improved as prompts were refined, showing improvements of 33% and 23% between the initial and final iterations for extracting study settings and behavioral components, respectively. In the initial prompts, 26 (43.3%) of 60 ChatGPT responses were correct. However, in the final iteration, ChatGPT extracted 43 (71.7%) of the 60 data elements, showing better performance in extracting explicitly stated study settings (28/30, 93.3%) than in extracting subjective behavioral components (15/30, 50%). Nonetheless, the varying accuracy across measures highlighted its limitations.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings underscore LLMs&#x2019; utility in extracting basic as well as explicit data in SLRs by using effective prompts. 
However, the results reveal significant limitations in handling nuanced, subjective criteria, emphasizing the necessity for human oversight.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>generative artificial intelligence</kwd><kwd>systematic reviews</kwd><kwd>evidence synthesis</kwd><kwd>human-AI collaboration</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Systematic literature reviews (SLRs) are indispensable across various fields, synthesizing evidence to inform decision-making in areas as diverse as public health, policy, and biomedical sciences, where rigor and comprehensiveness are paramount. With the rapid expansion of the literature, SLRs are more important than ever to help not only synthesize evidence but also identify areas in which the literature is robust or deficient [<xref ref-type="bibr" rid="ref1">1</xref>]. However, conducting SLRs is resource intensive, involving manual and careful screening of potentially relevant studies [<xref ref-type="bibr" rid="ref2">2</xref>]. In particular, SLRs that assess and report on analytical methods and key findings require more domain-specific expertise and multiple researchers for coding, making them more challenging.</p><p>Given the recent rapid advancement of large language models (LLMs), researchers have proposed their potential utility in conducting SLRs [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Several reviews have found that artificial intelligence (AI)&#x2013;enabled methods exhibit reasonable performance and improved efficiency in literature screening [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref9">9</xref>], an integral component of SLRs. In particular, the role of SLRs in health and biomedical sciences underscores the need for reliable, accurate data extraction tools that maintain the rigorous standards expected in these fields. 
However, studies that have tested the capabilities of AI for data extraction have identified challenges necessitating human intervention for completion [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>To understand whether recently developed LLMs can overcome this barrier, many studies have evaluated the performance of various LLMs in automating SLR tasks [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Their findings reveal LLMs&#x2019; potential in extracting data that are relatively easily retrievable (ie, study design, participant characteristics, and primary outcomes). However, these studies have not explored the ability of LLMs to extract more complex data (eg, methods used to obtain study outcomes), which may pose greater challenges as extracting such complex information is often subject to individual researchers&#x2019; perspectives [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>LLMs currently demonstrate advantages in assisting researchers, such as having rapid response times and providing high-level summaries of results [<xref ref-type="bibr" rid="ref15">15</xref>]; however, a recent study revealed the shortcomings in ChatGPT&#x2019;s depth of knowledge and contextual understanding for conducting SLRs in comparison to researchers [<xref ref-type="bibr" rid="ref16">16</xref>]. Yet, there is growing potential for ChatGPT&#x2019;s utility as an assistant in complex qualitative content analysis, as shown by a recent study that assessed its ability to categorize strategies and behaviors in forum posts about reducing sugar intake [<xref ref-type="bibr" rid="ref17">17</xref>]. 
Thus, although LLMs show potential in data extraction tasks for SLRs, there is more to be explored on the current LLMs&#x2019; capability to undertake a comprehensive approach that involves extracting not only basic study characteristics but also information critical for interpreting the results of studies within SLRs.</p><p>In this study, we aimed to evaluate an LLM&#x2019;s (ChatGPT) ability to extract more complex, nuanced data from scientific studies, representing a novel approach that goes beyond the simpler tasks of extracting descriptive information, such as study design or participant characteristics, which have been the primary focus of prior research. Our study applied a structured series of prompts and validations to systematically gauge where LLMs excel and where human oversight remains essential.</p><p>We used ChatGPT (GPT-4) for its accessibility and ease of use, recognizing that most individuals conducting SLRs may not be proficient with more customizable, technical platforms, such as the GPT application programming interface (API). Sophisticated LLM frameworks (eg, retrieval-augmented generation) offer technical capabilities that may better address needs for SLRs, but these features generally require AI-specific expertise. Consequently, our study prioritized a more common usage scenario in which a widely accessible GPT model is used for data extraction tasks.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>We focused on COVID-19 simulation modeling studies as a case study, leveraging a large collection of study reports that our team recently assembled for an SLR that aimed to assess the incorporation of human behavior dynamics in COVID-19 simulation models [<xref ref-type="bibr" rid="ref18">18</xref>]. 
For this study, the data elements selected for extraction were aligned with the specific objectives of our review: to evaluate ChatGPT&#x2019;s capacity to manage both explicit and nuanced data in SLRs. To determine whether LLMs can effectively screen papers and extract information, we randomly chose 10 of the papers and extracted data both manually and using ChatGPT. We selected a sample of 10 studies [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref28">28</xref>] as this number was manageable for an in-depth exploration of each paper, allowing us to conduct detailed comparisons of extraction accuracy and making it feasible to perform multiple iterations of prompt engineering and assessment throughout the process.</p><p>Our examination of COVID-19 modeling studies leveraged our team&#x2019;s expertise and enabled us to confidently determine correct answers for meaningful comparisons with ChatGPT&#x2019;s outputs. Although COVID-19 may no longer be at the forefront of global attention, these studies remain a <italic>methodologically rich test case</italic> rather than a focus of current clinical relevance. They combine straightforward features (eg, study design, interventions) with more abstract elements of human behavior (eg, compliance with public health measures, mobility changes), allowing us to systematically assess ChatGPT&#x2019;s performance across varying levels of complexity. Furthermore, the pandemic&#x2019;s urgency previously highlighted the value of accurate, rapid synthesis of research findings during health emergencies, making it an ideal context for evaluating the practical utility of LLMs in accelerating SLRs.</p><p>Many researchers who conduct SLRs may not necessarily have the expertise required to implement LLMs through advanced tools. 
Thus, we conducted our analysis using ChatGPT, given that it is one of the most widely adopted models, attracting 393 million users each month, as shown by October 2024 data [<xref ref-type="bibr" rid="ref29">29</xref>]. We used the web browser interface for its user-friendly design, as opposed to an API. Although using the API might have offered more controllable responses, our study prioritized accessibility and usability, reflecting the real-world context in which many researchers engage with ChatGPT. We selected the GPT-4 model specifically, as at the time of our analysis, it was one of the few that directly analyzed full text as a PDF file. Although our approach could also be applied to studies available in other formats, our focus on PDF files reflects their frequent use in academic publishing and SLRs.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study did not involve human participants, identifiable human data, or interaction with individuals. As such, it did not fall under the scope of research requiring review by an institutional review board, and ethical approval was not required.</p></sec><sec id="s2-3"><title>Data Elements</title><p>We defined the sets of measures to be extracted as (1) study settings (ie, analysis location, modeling approach, and analyzed interventions) and (2) behavioral components (ie, changes in travel and mobility, perception of risk and severity, and compliance and resistance to public health measures). Therefore, for each of the 10 studies [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref28">28</xref>], we extracted 6 distinct data elements, resulting in 60 data elements.</p><p>We distinguished these measures to reflect their nature: study settings are straightforward, whereas extracting information on behavioral components in COVID-19 models is influenced by researchers&#x2019; perspectives [<xref ref-type="bibr" rid="ref13">13</xref>]. 
We confined our study settings to information explicitly stated in the text. For behavioral components, we categorized them into no mention (A), mentioned but not modeled (B), modeled exogenously but not analyzed (C), modeled exogenously and analyzed (D), modeled endogenously but not analyzed (E), and modeled endogenously and analyzed (F). Endogenous modeling incorporated human behavior as an internal part of the model, where it both influenced and was influenced by the spread of COVID-19. Exogenous modeling indicated behavioral changes were external factors impacting the spread of COVID-19 without being influenced by it. This classification ranged from minimal (A) to comprehensive (F) incorporation into COVID-19 models.</p></sec><sec id="s2-4"><title>Data Extraction Process and Comparison</title><p>Two trained researchers independently extracted data from the 10 studies [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref28">28</xref>] and then reconciled discrepancies in their findings&#x2014;6 related to study settings and 11 to behavioral components&#x2014;for convergence. The researchers discussed any unresolved discrepancies with a third senior researcher to reach a consensus. Subsequently, we initiated a dedicated session for every study and prompt, uploading the individual files (in PDF) into the GPT-4 model using ChatGPT&#x2019;s user interface (accessed in January-April 2024) and documented the responses. To gain confidence in our manual screening, wherever ChatGPT, consistently through iterations of prompts, provided answers that disagreed with our manual coding, we reassessed our original codings. Following this reassessment and after making necessary adjustments, we finalized the manual screening results and considered them as the correct responses. 
We calculated individual researchers&#x2019; average accuracy rates (the percentage of their correct responses before any consensus was reached), allowing us to directly compare ChatGPT&#x2019;s performance against the individual researchers&#x2019; average accuracy for each measure throughout prompt iterations [<xref ref-type="bibr" rid="ref8">8</xref>].</p></sec><sec id="s2-5"><title>Prompt Engineering</title><p>We started by providing ChatGPT with a general prompt to extract each desired data element. Due to initially unsatisfactory results, we iteratively engineered prompts based on the initial responses and our manual coding. This process involved altering the wording of prompts for clarity and concisely adding specific descriptions of our objective, approach, and definitions of key terms [<xref ref-type="bibr" rid="ref30">30</xref>]. Particularly, we often guided ChatGPT to base any interpretations strictly on what was explicitly stated in the text, as it often made incorrect inferences. Furthermore, we followed up with ChatGPT regarding its incorrect responses by inquiring about potential improvements to the prompts after clarifying the desired answers [<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>We continued to refine the prompts until we achieved complete alignment with our manual screening results, reached saturation in improvement, or explored viable avenues for prompt enhancement to the best of our capabilities. Given the exploratory nature of this study and the fact that LLMs are designed to interact with users in real time, prompt refinement without a formal training phase reflects a common use case for this technology. 
Recent tutorials and case studies have demonstrated ChatGPT&#x2019;s feasibility in domain-specific and rapid literature reviews, reinforcing its relevance as a practical, real-world tool that benefits from iterative, user-guided prompt refinement [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. <xref ref-type="table" rid="table1">Table 1</xref> illustrates an example of the iterative process used to prompt ChatGPT for extracting data elements in this study. We applied a similar method to extract additional data elements, with the processes reported in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Example of the iterative process of prompt engineering to extract data elements.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Version</td><td align="left" valign="bottom">Description of prompt modification</td><td align="left" valign="bottom">Prompt</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Initial prompt</td><td align="left" valign="top">What is the simulation modeling approach used in this paper?</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Improved clarity</td><td align="left" valign="top">What model is used?</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Focused output</td><td align="left" valign="top">Specify the overall type of model used in the study. If the modeling approach is unspecified, please state so.</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Avoid overreporting</td><td align="left" valign="top">Specify the overall type (as opposed to the name) of the model used in the study. 
If the paper does not explicitly introduce the type of model used, state so by returning &#x201C;unspecified.&#x201D;</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Exclude inferred information</td><td align="left" valign="top">Specify the overall type (as opposed to the name) of the model used in the study. If the PDF file does not explicitly introduce the type of model used, state so by returning &#x201C;unspecified,&#x201D; and do not infer the type of model.</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Emphasis on explicit information</td><td align="left" valign="top">Read this PDF file line by line. Only specify the overall general type, as opposed to the name, of the model used in the study. If the author(s) of this PDF file do not explicitly introduce the type of model used, state so by returning &#x201C;unspecified,&#x201D; and do not infer the type of model. Be sure to only specify the type of foundational analytical model rather than any supplementary methods.</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Step-by-step instructions</td><td align="left" valign="top">Read the provided PDF document line by line, focusing on identifying the general category or type of model mentioned in the study. Your task is to:<list list-type="bullet"><list-item><p>Identify the type of model: Look for any mention of the foundational analytical model used in the research. Specify only the general category or type (eg, regression model) rather than the specific name or variant.</p></list-item><list-item><p>Explicit mention required: If the document does not explicitly mention the type of model used in the analysis, respond with &#x201C;unspecified.&#x201D; Avoid making inferences based on the context or the data presented.</p></list-item><list-item><p>Focus on the foundational model: Concentrate on identifying the primary analytical model that the study is based on. 
Disregard any supplementary methods, tools, or analytical techniques that are mentioned unless they are integral to the foundational model itself.</p></list-item></list></td></tr></tbody></table></table-wrap></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p><xref ref-type="table" rid="table2">Table 2</xref> summarizes ChatGPT&#x2019;s responses across 6 measures for the last iteration of prompts. These measures were divided into study settings and behavioral components.</p><p>Through the course of iterations, we identified 4 instances where ChatGPT consistently disagreed with our manual coding but was determined to have provided the correct answer upon reassessment. These instances were among the prompts related exclusively to study settings: 3 pertained to answers generated for prompts about interventions analyzed, and 1 addressed the correct location of analysis.</p><p>As a result of iterative prompt engineering, the average accuracy of ChatGPT&#x2019;s responses showed a marked improvement of 33% and 23% between the initial and final iterations for extracting study settings and behavioral components, respectively (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Specifically, in our initial prompts, 26 (43.3%) of 60 ChatGPT responses were correct. However, the latest prompt version yielded 43 (71.7%) correct answers. The iterative responses from ChatGPT and a comparison with our manual screening are detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. By the fourth iteration, ChatGPT outperformed the individual screeners&#x2019; average accuracy in identifying study settings (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
However, ChatGPT consistently could not achieve a level of precision comparable to that of manual screeners (<xref ref-type="fig" rid="figure1">Figure 1</xref>) when extracting behavioral components.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>ChatGPT&#x2019;s responses in the final version of the prompts.<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study</td><td align="left" valign="bottom" colspan="3">Measure group 1: study settings</td><td align="left" valign="bottom" colspan="3">Measure group 2: behavioral components in COVID-19 models<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Location of analysis</td><td align="left" valign="bottom">Type of model, as presented by authors</td><td align="left" valign="bottom">Interventions analyzed</td><td align="left" valign="bottom">Changes in travel and mobility</td><td align="left" valign="bottom">Perception of risk and severity</td><td align="left" valign="bottom">Compliance and resistance to public health measures</td></tr></thead><tbody><tr><td align="left" valign="top">Giordano et al [<xref ref-type="bibr" rid="ref19">19</xref>]</td><td align="left" valign="top">Italy</td><td align="left" valign="top">Compartmental model known as the SIDARTHE model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Mass vaccination campaigns</p></list-item><list-item><p>Nonpharmaceutical interventions</p></list-item><list-item><p>Intermittent open-close strategies</p></list-item><list-item><p>Different transmission rates due to new variants</p></list-item></list></td><td align="left" valign="top">A [B]<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">B</td><td align="left" valign="top">B [A]</td></tr><tr><td align="left" 
valign="top">Tuomisto et al [<xref ref-type="bibr" rid="ref20">20</xref>]</td><td align="left" valign="top">Helsinki University Hospital region in Finland</td><td align="left" valign="top">Agent-based model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Physical isolation</p></list-item><list-item><p>Testing and tracing</p></list-item><list-item><p>Mobility restrictions</p></list-item><list-item><p>Health care capacity enhancement</p></list-item><list-item><p>(Import of infections)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></p></list-item></list></td><td align="left" valign="top">D</td><td align="left" valign="top">B [A]</td><td align="left" valign="top">C [A]</td></tr><tr><td align="left" valign="top">Ashcroft et al [<xref ref-type="bibr" rid="ref21">21</xref>]</td><td align="left" valign="top">Does not focus on a specific geographic region</td><td align="left" valign="top">Mathematical model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Quarantine</p></list-item><list-item><p>Test-and-release strategies</p></list-item><list-item><p>Reinforced hygiene adherence</p></list-item></list></td><td align="left" valign="top">A</td><td align="left" valign="top">A</td><td align="left" valign="top">C [D]</td></tr><tr><td align="left" valign="top">Sneppen et al [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">Sweden</td><td align="left" valign="top">Agent-based model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Limiting social contacts</p></list-item><list-item><p>Lockdown strategies</p></list-item><list-item><p>Hygiene procedures</p></list-item></list></td><td align="left" valign="top">A</td><td align="left" valign="top">A</td><td align="left" valign="top">B [A]</td></tr><tr><td align="left" valign="top">Wong et al [<xref ref-type="bibr" rid="ref23">23</xref>]</td><td align="left" valign="top">Hong Kong</td><td align="left" 
valign="top">Unspecified</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Aggressive escalation of border control</p></list-item><list-item><p>Implementing COVID-19 tests for overseas returners</p></list-item><list-item><p>Quarantine measures and social distancing</p></list-item><list-item><p>Active case finding</p></list-item></list></td><td align="left" valign="top">A [B]</td><td align="left" valign="top">A</td><td align="left" valign="top">B [A]</td></tr><tr><td align="left" valign="top">Gostic et al [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td align="left" valign="top">Unknown</td><td align="left" valign="top">Mathematical model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Symptom screening</p></list-item><list-item><p>Risk screening</p></list-item></list></td><td align="left" valign="top">D [B]</td><td align="left" valign="top">A</td><td align="left" valign="top">A [D]</td></tr><tr><td align="left" valign="top">Kinoshita et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">Unknown</td><td align="left" valign="top">Two-type branching process model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Contact tracing</p></list-item><list-item><p>Case isolation</p></list-item></list></td><td align="left" valign="top">A</td><td align="left" valign="top">B</td><td align="left" valign="top">C [A]</td></tr><tr><td align="left" valign="top">Paul et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">Emphasis on South Asia, including India, Bangladesh, and Pakistan</td><td align="left" valign="top">SEIR epidemic model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Lockdown</p></list-item><list-item><p>Social distancing</p></list-item><list-item><p>Individual-based precautionary measures</p></list-item></list></td><td align="left" valign="top">C [D]</td><td align="left" valign="top">D [A]</td><td 
align="left" valign="top">B</td></tr><tr><td align="left" valign="top">Ebigbo et al [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">Unknown</td><td align="left" valign="top">Model-based on theoretical assumptions</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Routine pre-endoscopy virus testing</p></list-item><list-item><p>High-risk personal protective equipment use</p></list-item><list-item><p>(Pre-endoscopy risk assessment questionnaire)</p></list-item></list></td><td align="left" valign="top">A</td><td align="left" valign="top">A</td><td align="left" valign="top">B [A]</td></tr><tr><td align="left" valign="top">Kim and Paul [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">Unknown</td><td align="left" valign="top">Unspecified</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Automated contact tracing</p></list-item><list-item><p>Use of personal protective equipment</p></list-item><list-item><p>Limited social distancing</p></list-item></list></td><td align="left" valign="top">A</td><td align="left" valign="top">A</td><td align="left" valign="top">C [D]</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>All responses are shortened for presentation.</p></fn><fn id="table2fn2"><p><sup>b</sup>Response categories: A, no mention; B, mentioned but not modeled; C, modeled exogenously but not analyzed; D, modeled exogenously and analyzed; E, modeled endogenously but not analyzed; F, modeled endogenously and analyzed.</p></fn><fn id="table2fn3"><p><sup>c</sup>Brackets indicate manually screened responses, and parentheses flag ChatGPT&#x2019;s additional incorrect info; both are used only in cells with incorrect responses.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Average percentage of correct ChatGPT responses throughout iterations (bars) in comparison to the average 
accuracy of screeners before consensus (dashed lines) across study settings and behavioral components in COVID-19 models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68097_fig01.png"/></fig><p>Specifically, ChatGPT provided correct responses for all 10 papers [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref28">28</xref>] when prompted to identify the analyzed location and model type used by the second and seventh iterations of prompts, respectively. At most, ChatGPT correctly identified 8 of 10 interventions analyzed within the studies, achieving a peak accuracy of 80% by the sixth iteration (<xref ref-type="fig" rid="figure2">Figure 2</xref>). Conversely, it took 6 and 7 iterations to reach a peak accuracy of 80% (ie, 8 of 10 correct classifications) for classifying how each study assessed changes in travel and mobility and the perception of risk and severity, respectively. Our alignment with ChatGPT&#x2019;s responses for coding compliance and resistance to public health measures only achieved a maximum consistency rate of 3 (30%) correct answers across the 10 studies [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref28">28</xref>] (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><p>Finally, the contrast between the distribution of the manual coding and ChatGPT is presented in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, and the distributions of ChatGPT&#x2019;s responses are presented in Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Percentage of correct ChatGPT responses throughout iterations of each prompt. 
The red bars depict the progression of ChatGPT&#x2019;s accuracy for the three study setting elements, and the blue bars present the progression for the three behavioral component elements. Since the location of analysis achieved 100% by the second version, we discontinued further iterations for this prompt.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e68097_fig02.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our analysis underscores the finding that ChatGPT&#x2019;s assistance in full-text screening of study reports is particularly useful when handling simple inquiries, specifically general study settings, for which details are typically explicit in the text. However, the task of assessing nuances that necessitate drawing inferences (eg, the integration of human behavior in COVID-19 models), where the study may or may not have an explicit discussion, presents a significant challenge for ChatGPT.</p><p>Rather than prolonging the iterative process to elicit correct responses, we adopted a strategy of structuring prompts based on ChatGPT&#x2019;s previous responses to ensure explicitness over the course of 7 iterations. Although it may have been possible to continue iterating to improve response accuracy, such a strategy is impractical in real-world scenarios. In addition, we acknowledge the potential risk of overfitting due to iterative prompt engineering on the same set of studies; however, our primary objective was to explore the utility of ChatGPT in real-world use cases where prompt refinement is common practice. This utility is especially relevant, given the broad reliance on SLRs to guide evidence-based practices across fields, with particular importance in health and biomedical sciences, where decisions impact health outcomes [<xref ref-type="bibr" rid="ref31">31</xref>]. 
Hence, we limited this study to a manageable number of papers to better understand the limitations of prompt engineering itself. Importantly, the framework we used for evaluating ChatGPT (GPT-4) in COVID-19 modeling studies can be applied similarly to other SLRs regardless of the specific topic.</p><p>We highlighted that ChatGPT&#x2019;s performance is influenced by the explicitness of information within the text, not just by the clarity or objectivity of the prompts. This underscores a nuanced limitation: the technology&#x2019;s current dependency on explicit textual evidence for accurate data extraction. This limitation is notable, even with the use of straightforward prompts, underscoring a significant barrier in LLMs&#x2019; application to literature analysis. For instance, despite clear prompts, ChatGPT often struggled to correctly identify the model type in studies&#x2014;expected to be straightforward&#x2014;unless explicitly mentioned. Even when instructed to label model types as &#x201C;unspecified&#x201D; in the absence of clear documentation, early iterations often resulted in incorrect answers rather than adherence to the &#x201C;unspecified&#x201D; directive. This illustrates that ChatGPT&#x2019;s accuracy is dependent not only on the prompt structure but also on the presence of explicitly detailed textual information. Hence, a central insight from this analysis is the significant obstacle that LLMs encounter when navigating ambiguity, further complicating the tasks of engineering effective prompts.</p><p>Furthermore, among ChatGPT&#x2019;s responses that did not correspond to our manual coding, we observed a tendency for ChatGPT to extrapolate beyond the presented data. Despite instructions to confine responses to the explicit content of each study, ChatGPT often listed additional interventions not stated by the authors. 
This pattern of overreporting was also evident when categorizing studies based on the extent of their integration of behavioral components into the model. ChatGPT frequently assigned a higher integration level than that supported by the studies&#x2019; text (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). These skewed errors align with ChatGPT&#x2019;s tendency to &#x201C;hallucinate,&#x201D; or provide confidently articulated yet factually unsupported responses [<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Despite these challenges, LLM tools may be useful for SLRs. For 3 (30%) studies, ChatGPT correctly identified and analyzed interventions that we initially overlooked. For example, in Wong et al&#x2019;s study [<xref ref-type="bibr" rid="ref23">23</xref>], ChatGPT identified COVID-19 testing as an intervention in 6 of 7 prompt iterations&#x2014;a detail our original manual assessment missed. This led us to reevaluate the study, and upon confirming ChatGPT&#x2019;s accuracy, we modified our assessment accordingly. Conversely, we revisited our categorization of behavioral components in 6 instances with which ChatGPT consistently disagreed, but we confirmed that our original manual coding was correct.</p><p>The main contribution of this study is to extend the understanding of current LLMs&#x2019; capabilities and limitations in handling complex data, providing valuable insights for those who conduct SLRs and are exploring the use of LLM platforms. Our observation that ChatGPT outperformed the average accuracy of individual reviewers when identifying study settings underscores its utility as an assistant or a second reviewer in extracting basic measures for SLRs. 
These results support previous research, which indicates the potential of LLMs for handling basic data extraction tasks effectively [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>] and their use as a collaborator [<xref ref-type="bibr" rid="ref32">32</xref>] or a second rater [<xref ref-type="bibr" rid="ref33">33</xref>] in SLRs. However, for ChatGPT to be effectively used in this role, there remains a need for researchers to provide clear and detailed prompts that provide the relevant context. This approach requires researchers to have a thorough understanding of the context relevant to their inquiries and have access to reliable, coded data elements to directly compare against ChatGPT&#x2019;s responses.</p><p>In terms of extracting complex components, ChatGPT failed to achieve comparable accuracy to that of individual screeners, highlighting the continued necessity of manual data extraction and additional research to overcome the limitations of this technology. Although further testing with a test sample may yield different results, the insufficient performance observed in our training data alone suggests that current LLMs remain unreliable for handling complex data extraction tasks. These findings align with other studies that discuss LLM performance, which similarly conclude that although automation is advancing, prevailing errors emphasize that structured oversight remains critical [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec><sec id="s4-2"><title>Limitations</title><p>This study is subject to several limitations. First, this study focused on one aspect of the SLR process, given that other steps (eg, writing Boolean query formulations [<xref ref-type="bibr" rid="ref35">35</xref>] and screening titles and abstracts [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]) have already been examined in greater detail. 
Second, we evaluated ChatGPT with only one review topic, which limits the generalizability of our results. Third, we often noticed inconsistencies in ChatGPT&#x2019;s responses for the same prompts, but we did not formally assess reproducibility. Fourth, although accurate data extraction highly depends on the LLMs&#x2019; capabilities in accurately parsing PDF files [<xref ref-type="bibr" rid="ref12">12</xref>], our paper does not quantify the impact of any errors in converting PDF files to text. Nonetheless, our results still highlight the stark differences in LLMs&#x2019; capabilities in accurately extracting simple versus complex data elements. Fifth, we selected ChatGPT for this study due to its wide accessibility and usability. We recognize that our exclusive focus is a limitation, as other models may have provided different insights or comparative performance benchmarks. Future research should explore the capabilities of multiple models to provide a more comprehensive understanding of LLM performance in this context. For example, certain advanced approaches (eg, retrieval-augmented generation) exist to better handle evidence or references from external sources. Since our focus was on a common, practical scenario in which typical reviewers use a publicly accessible GPT model, these specialized methods fall outside the scope of this study. Finally, due to the rapidly changing nature of LLMs, our findings may not hold over time.</p></sec><sec id="s4-3"><title>Conclusion</title><p>Overall, LLMs, and ChatGPT in particular, show promising performance in assisting the extraction of explicitly stated information from the full text of study reports, particularly when limited scientific reasoning is required. However, ChatGPT currently exhibits limited potential for fully automating data extraction across more complex, subjective measures. 
Our findings emphasize the ongoing necessity of human oversight in handling complex, nuanced data extraction tasks, even as LLMs continue to improve. This position is consistent with broader calls in the literature to adopt a cautious, well-evaluated approach to integrating LLMs into evidence synthesis workflows [<xref ref-type="bibr" rid="ref15">15</xref>]. We highlight an important contribution to human-AI collaboration research, demonstrating the need to integrate AI tools with human oversight in SLRs, particularly in areas where current models fall short.</p></sec></sec></body><back><ack><p>The authors thank Ali Akhavan, Zeynep Hasgul, and Ning-Yuan Georgia Liu, who provided feedback on earlier versions of this paper, as well as the members of the US National Science Foundation (NSF) grant team, who shared feedback on the analysis design. This work was supported by the US NSF, Division of Mathematical Sciences and Division of Social and Economic Sciences (grant agreement 2229819).</p></ack><notes><sec><title>Data Availability</title><p>This study did not involve analysis or acquisition of datasets outside what is reported; however, all data relevant to this paper and the process of research are shared in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></notes><fn-group><fn fn-type="con"><p>HM and MSJ designed the experiment. HM, DC, and HL reviewed the papers to form the ground truth. HL, HM, and DC performed prompt engineering. HL and HM drafted the manuscript. HM and HL visualized results. MSJ and NG provided supervision and funding. 
All authors have reviewed the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">SLR</term><def><p>systematic literature review</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Owens</surname><given-names>JK</given-names> </name></person-group><article-title>Systematic reviews: brief overview of methods, limitations, and resources</article-title><source>Nurse Author Ed</source><year>2021</year><month>12</month><volume>31</volume><issue>3-4</issue><fpage>69</fpage><lpage>72</lpage><pub-id pub-id-type="doi">10.1111/nae2.28</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Phillips</surname><given-names>V</given-names> </name><name name-style="western"><surname>Barker</surname><given-names>E</given-names> </name></person-group><article-title>Systematic reviews: structure, form and content</article-title><source>J Perioper Pract</source><year>2021</year><month>09</month><volume>31</volume><issue>9</issue><fpage>349</fpage><lpage>353</lpage><pub-id pub-id-type="doi">10.1177/1750458921994693</pub-id><pub-id pub-id-type="medline">34228554</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hossain</surname><given-names>MM</given-names> </name></person-group><article-title>Using ChatGPT 
and other forms of generative AI in systematic reviews: challenges and opportunities</article-title><source>J Med Imaging Radiat Sci</source><year>2024</year><month>03</month><volume>55</volume><issue>1</issue><fpage>11</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1016/j.jmir.2023.11.005</pub-id><pub-id pub-id-type="medline">38040497</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mahuli</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Rai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mahuli</surname><given-names>AV</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name></person-group><article-title>Application ChatGPT in conducting systematic reviews and meta-analyses</article-title><source>Br Dent J</source><year>2023</year><month>07</month><volume>235</volume><issue>2</issue><fpage>90</fpage><lpage>92</lpage><pub-id pub-id-type="doi">10.1038/s41415-023-6132-y</pub-id><pub-id pub-id-type="medline">37500847</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Automated medical literature screening using artificial intelligence: a systematic review and meta-analysis</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>07</month><day>12</day><volume>29</volume><issue>8</issue><fpage>1425</fpage><lpage>1432</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac066</pub-id><pub-id 
pub-id-type="medline">35641139</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blaizot</surname><given-names>A</given-names> </name><name name-style="western"><surname>Veettil</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Saidoung</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Using artificial intelligence methods for systematic review in health sciences: a systematic review</article-title><source>Res Synth Methods</source><year>2022</year><month>05</month><volume>13</volume><issue>3</issue><fpage>353</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1553</pub-id><pub-id pub-id-type="medline">35174972</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de la Torre-L&#x00F3;pez</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ram&#x00ED;rez</surname><given-names>A</given-names> </name><name name-style="western"><surname>Romero</surname><given-names>JR</given-names> </name></person-group><article-title>Artificial intelligence to automate the systematic review of scientific literature</article-title><source>Computing</source><year>2023</year><month>10</month><volume>105</volume><issue>10</issue><fpage>2171</fpage><lpage>2194</lpage><pub-id pub-id-type="doi">10.1007/s00607-023-01181-x</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jonnalagadda</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>P</given-names> </name><name name-style="western"><surname>Huffman</surname><given-names>MD</given-names> 
</name></person-group><article-title>Automating data extraction in systematic reviews: a systematic review</article-title><source>Syst Rev</source><year>2015</year><month>06</month><day>15</day><volume>4</volume><issue>1</issue><fpage>78</fpage><pub-id pub-id-type="doi">10.1186/s13643-015-0066-7</pub-id><pub-id pub-id-type="medline">26073888</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fabiano</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bhambra</surname><given-names>N</given-names> </name><etal/></person-group><article-title>How to optimize the systematic review process using AI tools</article-title><source>JCPP Adv</source><year>2024</year><month>06</month><volume>4</volume><issue>2</issue><fpage>e12234</fpage><pub-id pub-id-type="doi">10.1002/jcv2.12234</pub-id><pub-id pub-id-type="medline">38827982</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khraisha</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Put</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kappenberg</surname><given-names>J</given-names> </name><name name-style="western"><surname>Warraitch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hadfield</surname><given-names>K</given-names> </name></person-group><article-title>Can large language models replace humans in systematic reviews? 
Evaluating GPT-4&#x2019;s efficacy in screening and extracting data from peer-reviewed and grey literature in multiple languages</article-title><source>Res Synth Methods</source><year>2024</year><month>07</month><volume>15</volume><issue>4</issue><fpage>616</fpage><lpage>626</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1715</pub-id><pub-id pub-id-type="medline">38484744</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gartlehner</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kahwati</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hilscher</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Data extraction for evidence synthesis using a large language model: a proof-of-concept study</article-title><source>Res Synth Methods</source><year>2024</year><month>07</month><volume>15</volume><issue>4</issue><fpage>576</fpage><lpage>589</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1710</pub-id><pub-id pub-id-type="medline">38432227</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Konet</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>I</given-names> </name><name name-style="western"><surname>Gartlehner</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Performance of two large language models for data extraction in evidence synthesis</article-title><source>Res Synth Methods</source><year>2024</year><month>09</month><volume>15</volume><issue>5</issue><fpage>818</fpage><lpage>824</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1732</pub-id><pub-id pub-id-type="medline">38895747</pub-id></nlm-citation></ref><ref 
id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Achter</surname><given-names>S</given-names> </name><name name-style="western"><surname>Borit</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cottineau</surname><given-names>C</given-names> </name><name name-style="western"><surname>Meyer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Polhill</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Radchuk</surname><given-names>V</given-names> </name></person-group><article-title>How to conduct more systematic reviews of agent-based models and foster theory development - taking stock and looking ahead</article-title><source>Environ Model Softw</source><year>2024</year><month>02</month><volume>173</volume><fpage>105867</fpage><pub-id pub-id-type="doi">10.1016/j.envsoft.2023.105867</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>T</given-names> </name><name name-style="western"><surname>Higgins</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Deeks</surname><given-names>JJ</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Higgins</surname><given-names>JPT</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chandler</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cumpston</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>T</given-names> </name><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><name 
name-style="western"><surname>Welch</surname><given-names>VA</given-names> </name></person-group><article-title>Chapter 5: collecting data</article-title><source>Cochrane Handbook for Systematic Reviews of Interventions</source><year>2023</year><access-date>2025-08-15</access-date><publisher-name>Cochrane</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.cochrane.org/authors/handbooks-and-manuals/handbook">https://www.cochrane.org/authors/handbooks-and-manuals/handbook</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qureshi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shaughnessy</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gill</surname><given-names>KAR</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Li</surname><given-names>T</given-names> </name><name name-style="western"><surname>Agai</surname><given-names>E</given-names> </name></person-group><article-title>Are ChatGPT and large language models &#x201C;the answer&#x201D; to bringing us closer to systematic review automation?</article-title><source>Syst Rev</source><year>2023</year><month>04</month><day>29</day><volume>12</volume><issue>1</issue><fpage>72</fpage><pub-id pub-id-type="doi">10.1186/s13643-023-02243-z</pub-id><pub-id pub-id-type="medline">37120563</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mostafapour</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fortier</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Pacheco</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Murray</surname><given-names>H</given-names> </name><name name-style="western"><surname>Garber</surname><given-names>G</given-names> </name></person-group><article-title>Evaluating literature reviews conducted by humans versus ChatGPT: comparative study</article-title><source>JMIR AI</source><year>2024</year><month>08</month><day>19</day><volume>3</volume><issue>1</issue><fpage>e56537</fpage><pub-id pub-id-type="doi">10.2196/56537</pub-id><pub-id pub-id-type="medline">39159446</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bijker</surname><given-names>R</given-names> </name><name name-style="western"><surname>Merkouris</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Dowling</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Rodda</surname><given-names>SN</given-names> </name></person-group><article-title>ChatGPT for automated qualitative research: content analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>07</month><day>25</day><volume>26</volume><issue>1</issue><fpage>e59050</fpage><pub-id pub-id-type="doi">10.2196/59050</pub-id><pub-id pub-id-type="medline">39052327</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name><name name-style="western"><surname>Mahmoudi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Jalali</surname><given-names>MS</given-names> </name></person-group><article-title>Review of human behavior integration in COVID-19 modeling studies</article-title><source>J Public Health 
(Oxf)</source><year>2025</year><month>07</month><day>12</day><fpage>fdaf082</fpage><pub-id pub-id-type="doi">10.1093/pubmed/fdaf082</pub-id><pub-id pub-id-type="medline">40650616</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giordano</surname><given-names>G</given-names> </name><name name-style="western"><surname>Colaneri</surname><given-names>M</given-names> </name><name name-style="western"><surname>Di Filippo</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Modeling vaccination rollouts, SARS-CoV-2 variants and the requirement for non-pharmaceutical interventions in Italy</article-title><source>Nat Med</source><year>2021</year><month>06</month><volume>27</volume><issue>6</issue><fpage>993</fpage><lpage>998</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01334-5</pub-id><pub-id pub-id-type="medline">33864052</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tuomisto</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Yrj&#x00F6;l&#x00E4;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kolehmainen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bonsdorff</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pekkanen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tikkanen</surname><given-names>T</given-names> </name></person-group><article-title>An agent-based epidemic model REINA for COVID-19 to identify destructive policies</article-title><source>medRxiv</source><comment>Preprint posted online on  Apr 17, 2020</comment><pub-id pub-id-type="doi">10.1101/2020.04.09.20047498</pub-id></nlm-citation></ref><ref 
id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ashcroft</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lehtinen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Angst</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Low</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bonhoeffer</surname><given-names>S</given-names> </name></person-group><article-title>Quantifying the impact of quarantine duration on COVID-19 transmission</article-title><source>Elife</source><year>2021</year><month>02</month><day>5</day><volume>10</volume><fpage>1</fpage><lpage>33</lpage><pub-id pub-id-type="doi">10.7554/eLife.63704</pub-id><pub-id pub-id-type="medline">33543709</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sneppen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Simonsen</surname><given-names>L</given-names> </name></person-group><article-title>Impact of superspreaders on dissemination and mitigation of COVID-19</article-title><source>medRxiv</source><comment>Preprint posted online on  Jul 3, 2020</comment><pub-id pub-id-type="doi">10.1101/2020.05.17.20104745</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>MCS</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>RWY</given-names> </name><name name-style="western"><surname>Chong</surname><given-names>KC</given-names> </name><etal/></person-group><article-title>Stringent containment 
measures without complete city lockdown to achieve low incidence and mortality across two waves of COVID-19 in Hong Kong</article-title><source>BMJ Glob Health</source><year>2020</year><month>10</month><volume>5</volume><issue>10</issue><fpage>e003573</fpage><pub-id pub-id-type="doi">10.1136/bmjgh-2020-003573</pub-id><pub-id pub-id-type="medline">33028700</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gostic</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gomez</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Mummah</surname><given-names>RO</given-names> </name><name name-style="western"><surname>Kucharski</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Lloyd-Smith</surname><given-names>JO</given-names> </name></person-group><article-title>Estimated effectiveness of symptom and risk screening to prevent the spread of COVID-19</article-title><source>Elife</source><year>2020</year><month>02</month><day>24</day><volume>9</volume><fpage>e55570</fpage><pub-id pub-id-type="doi">10.7554/eLife.55570</pub-id><pub-id pub-id-type="medline">32091395</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kinoshita</surname><given-names>R</given-names> </name><name name-style="western"><surname>Anzai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jung</surname><given-names>SM</given-names> </name><etal/></person-group><article-title>Containment, contact tracing and asymptomatic transmission of novel coronavirus disease (COVID-19): a modelling study</article-title><source>J Clin Med</source><year>2020</year><month>09</month><day>27</day><volume>9</volume><issue>10</issue><fpage>3125</fpage><pub-id 
pub-id-type="doi">10.3390/jcm9103125</pub-id><pub-id pub-id-type="medline">32992614</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Paul</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chatterjee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bairagi</surname><given-names>N</given-names> </name></person-group><article-title>Prediction on COVID-19 epidemic for different countries: focusing on South Asia under various precautionary measures</article-title><source>medRxiv</source><comment>Preprint posted online on Apr 11, 2020</comment><pub-id pub-id-type="doi">10.1101/2020.04.08.20055095</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ebigbo</surname><given-names>A</given-names> </name><name name-style="western"><surname>R&#x00F6;mmele</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bartenschlager</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Cost-effectiveness analysis of SARS-CoV-2 infection prevention strategies including pre-endoscopic virus testing and use of high risk personal protective equipment</article-title><source>Endoscopy</source><year>2021</year><month>02</month><volume>53</volume><issue>2</issue><fpage>156</fpage><lpage>161</lpage><pub-id pub-id-type="doi">10.1055/a-1294-0427</pub-id><pub-id pub-id-type="medline">33080647</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Paul</surname><given-names>A</given-names> 
</name></person-group><article-title>Automated contact tracing: a game of big numbers in the time of COVID-19</article-title><source>J R Soc Interface</source><year>2021</year><month>02</month><volume>18</volume><issue>175</issue><fpage>20200954</fpage><pub-id pub-id-type="doi">10.1098/rsif.2020.0954</pub-id><pub-id pub-id-type="medline">33622147</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Duarte</surname><given-names>F</given-names> </name></person-group><article-title>Number of ChatGPT users</article-title><source>Exploding Topics</source><year>2024</year><month>12</month><access-date>2025-08-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://explodingtopics.com/blog/chatgpt-users">https://explodingtopics.com/blog/chatgpt-users</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Heston</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Khun</surname><given-names>C</given-names> </name></person-group><article-title>Prompt engineering in medical education</article-title><source>International Medical Education</source><year>2023</year><month>08</month><volume>2</volume><issue>3</issue><fpage>198</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.3390/ime2030019</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schmidt</surname><given-names>L</given-names> </name><name name-style="western"><surname>Finnerty Mutlu</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Elmore</surname><given-names>R</given-names> </name><name 
name-style="western"><surname>Olorisade</surname><given-names>BK</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Higgins</surname><given-names>JPT</given-names> </name></person-group><article-title>Data extraction methods for systematic review (semi)automation: update of a living systematic review</article-title><source>F1000Res</source><year>2023</year><month>10</month><volume>10</volume><fpage>401</fpage><pub-id pub-id-type="doi">10.12688/f1000research.51117.2</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Scherbakov</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hubig</surname><given-names>N</given-names> </name><name name-style="western"><surname>Jansari</surname><given-names>V</given-names> </name><name name-style="western"><surname>Bakumenko</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lenert</surname><given-names>LA</given-names> </name></person-group><article-title>The emergence of large language models (LLM) as a tool in literature reviews: an LLM automated systematic review</article-title><source>arXiv</source><comment>Preprint posted online on Sep 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.04600</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Motzfeldt Jensen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brix Danielsen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Riis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ChatGPT-4o can serve as the second rater for data extraction in systematic 
reviews</article-title><source>PLoS One</source><year>2025</year><month>01</month><day>7</day><volume>20</volume><issue>1</issue><fpage>e0313401</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0313401</pub-id><pub-id pub-id-type="medline">39774443</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name></person-group><article-title>Large language models streamline automated systematic review: a preliminary study</article-title><source>arXiv</source><comment>Preprint posted online on Jan 9, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.15702</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Scells</surname><given-names>H</given-names> </name><name name-style="western"><surname>Koopman</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zuccon</surname><given-names>G</given-names> </name></person-group><article-title>Can ChatGPT write a good Boolean query for systematic review literature search?</article-title><conf-name>SIGIR '23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval</conf-name><conf-date>Jul 23-27, 2023</conf-date><conf-loc>Taipei, Taiwan</conf-loc><fpage>1426</fpage><lpage>1436</lpage><pub-id pub-id-type="doi">10.1145/3539618.3591703</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Syriani</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>David</surname><given-names>I</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>G</given-names> </name></person-group><article-title>Screening articles for systematic reviews with ChatGPT</article-title><source>J Comput Lang</source><year>2024</year><month>08</month><volume>80</volume><fpage>101287</fpage><pub-id pub-id-type="doi">10.1016/j.cola.2024.101287</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The iterative process of prompt engineering and additional results.</p><media xlink:href="ai_v4i1e68097_app1.pdf" xlink:title="PDF File, 119 KB"/></supplementary-material></app-group></back></article>