<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e64447</article-id><article-id pub-id-type="doi">10.2196/64447</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title><bold>Large Language Models for Thematic Summarization in Qualitative Health Care Research: Comparative Analysis of Model and Human Performance</bold></article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Castellanos</surname><given-names>Arturo</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Jiang</surname><given-names>Haoqiang</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Gomes</surname><given-names>Paulo</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref 
ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Vander Meer</surname><given-names>Debra</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Castillo</surname><given-names>Alfred</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Mason School of Business, William &#x0026; Mary</institution><addr-line>Williamsburg</addr-line><addr-line>VA</addr-line><country>United States</country></aff><aff id="aff2"><institution>College of Informatics, Northern Kentucky University</institution><addr-line>Highland Heights</addr-line><addr-line>KY</addr-line><country>United States</country></aff><aff id="aff3"><institution>Information Systems and Business Analytics Department, College of Business, Florida International University</institution><addr-line>11200 SW 8th Street</addr-line><addr-line>Miami</addr-line><addr-line>FL</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Dankar</surname><given-names>Fida</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Chenxu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ash</surname><given-names>Elliott</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Xiaoni</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Feng</surname><given-names>Yebo</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Paulo Gomes, PhD, Information 
Systems and Business Analytics Department, College of Business, Florida International University, 11200 SW 8th Street, Miami, FL, 33199, United States, 1 305-348-4610; <email>pgomes@fiu.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>4</day><month>4</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e64447</elocation-id><history><date date-type="received"><day>17</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>12</month><year>2024</year></date><date date-type="accepted"><day>27</day><month>02</month><year>2025</year></date></history><copyright-statement>&#x00A9; Arturo Castellanos, Haoqiang Jiang, Paulo Gomes, Debra Vander Meer, Alfred Castillo. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 4.4.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e64447"/><abstract><sec><title>Background</title><p>The application of large language models (LLMs) in analyzing expert textual online data is a topic of growing importance in computational linguistics and qualitative research within health care settings.</p></sec><sec><title>Objective</title><p>The objective of this study was to understand how LLMs can help analyze expert textual data. Topic modeling enables scaling the thematic analysis of content of a large corpus of data, but it still requires interpretation. We investigate the use of LLMs to help researchers scale this interpretation.</p></sec><sec sec-type="methods"><title>Methods</title><p>The primary methodological phases of this project were (1) collecting data representing posts to an online nurse forum, as well as cleaning and preprocessing the data; (2) using latent Dirichlet allocation (LDA) to derive topics; (3) using human categorization for topic modeling; and (4) using LLMs to complement and scale the interpretation of thematic analysis. The purpose is to compare the outcomes of human interpretation with those derived from LLMs.</p></sec><sec sec-type="results"><title>Results</title><p>There is substantial agreement (247/310, 80%) between LLM and human interpretation. For two-thirds of the topics, human evaluation and LLMs agree on alignment and convergence of themes. Furthermore, LLM subthemes offer depth of analysis within LDA topics, providing detailed explanations that align with and build upon established human themes. 
Nonetheless, LLMs identify coherence and complementarity where human evaluation does not.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs enable the automation of the interpretation task in qualitative research. There are challenges in the use of LLMs for evaluation of the resulting themes.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>generative AI</kwd><kwd>large language models</kwd><kwd>ChatGPT</kwd><kwd>machine learning</kwd><kwd>health care</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Qualitative studies in health care shed light on the perceptions, narratives, and discourses that underlie human behavior. This approach enhances understanding of both clinicians and patients&#x2019; experiences and expectations, thereby informing decision-making for health policy [<xref ref-type="bibr" rid="ref1">1</xref>]. Traditionally, these studies involved data collection through face-to-face interviews, observation or artifact analysis, transcription, and manual human coding for sense-making. Recent online advances, such as social media interactions, online reviews, news articles, and in-depth forum discussions, allow researchers and policy makers to collect larger data samples at lower time costs compared with direct interviews [<xref ref-type="bibr" rid="ref2">2</xref>]. The advent of text mining tools, which allow researchers to cluster text samples into groups based on statistical similarity, has enabled partial automation of the sense-making step. For instance, the use of natural language processing (NLP) to identify risk factors from unstructured free-text clinical notes [<xref ref-type="bibr" rid="ref3">3</xref>]. 
Yet, these tools provide only the groupings, leaving the human to apply thematic interpretation [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Recent advances in generative artificial intelligence (AI) provide valuable tools for researchers conducting qualitative studies, offering support in both data analysis and interpretation. In particular, large language models (LLMs), which are statistical models built using internet-scale datasets, can generate human-style writing in response to natural-language prompts, and assist in analyzing textual data to identify patterns, themes, and underlying meanings [<xref ref-type="bibr" rid="ref6">6</xref>]. LLMs can aid researchers in conducting thematic analysis by identifying recurrent themes, concepts, or ideas across a dataset supporting the automation of thematic interpretation.</p></sec><sec id="s1-2"><title>Previous Work</title><p>Topic modeling is a popular approach to uncovering insights in text mining. It identifies patterns in word usage and clusters words into topics, making it a popular method for exploring large, unstructured text datasets. Latent Dirichlet allocation (LDA) is a widely applied method for topic modeling. Previous work has used LDA modeling to analyze social media data and derive insights on key topics [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Despite the new perspectives LDA approaches offer for scientific research [<xref ref-type="bibr" rid="ref9">9</xref>], using LDA for topic modeling presents challenges [<xref ref-type="bibr" rid="ref10">10</xref>], notably the significant role of human interpretative contribution in the process [<xref ref-type="bibr" rid="ref11">11</xref>], which limits scalability. In addition, there is a noted lack of user-friendly tools that support the entire workflow, necessitating a human-in-the-loop to interpret the derived topics. 
In this paper, we argue that LLMs can help resolve some of the challenges of LDA analysis, specifically in interpreting and labeling topics.</p><p>LLMs are emerging as an increasingly reliable and effective tool for interpretative qualitative research, combining the scale that computational techniques allow for with the human&#x2019;s qualitative logic [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Previous studies show that ChatGPT (OpenAI) yields comparable results to manual coding with substantial time savings [<xref ref-type="bibr" rid="ref14">14</xref>]. These studies compare emergent themes in human and AI-generated qualitative analyses, revealing similarities and differences. For instance, some themes are recognized by human coders but missed by ChatGPT, and vice versa [<xref ref-type="bibr" rid="ref15">15</xref>]. LLMs can highlight novel connections within the data that are not apparent to human coders. In both deductive and inductive thematic analysis, ChatGPT extended the researchers&#x2019; views of the themes present in the data [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>There are challenges associated with the use of LLMs. In the previously cited study [<xref ref-type="bibr" rid="ref14">14</xref>], ChatGPT was able to recreate the themes originally identified through more traditional methods. However, it was less successful at identifying subtle, interpretive themes, and more successful with concrete, descriptive themes. LLMs may miss themes that require a deep understanding of context or specific domain knowledge. For example, themes related to niche cultural practices or specific professional areas may not be accurately identified by AI without targeted training.</p><p>LLMs can also reflect biases present in its training data, potentially overlooking or misinterpreting themes that deviate from its learned patterns. 
On the other hand, LLM analyses can identify patterns and themes that might be overlooked by human coders due to their preconceived notions or cognitive biases. Further challenges associated with the use of LLMs are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Challenges of large language models.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Challenge</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Citations</td></tr></thead><tbody><tr><td align="left" valign="top">Ambiguity resolution</td><td align="left" valign="top">LLMs<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> might struggle to disambiguate certain terms or topics, leading to unclear topic categorization.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]</td></tr><tr><td align="left" valign="top">Overfitting</td><td align="left" valign="top">LLMs can sometimes focus too much on common or popular topics, missing out on niche or less frequently discussed topics.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]</td></tr><tr><td align="left" valign="top">Lack of context</td><td align="left" valign="top">Without external knowledge or the ability to track long-term context, LLMs might misinterpret or miss certain topic nuances.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref20">20</xref>]</td></tr><tr><td align="left" valign="top">Bias</td><td align="left" valign="top">LLMs are trained on vast amounts of data, which may contain biases. 
This can affect topic analysis results.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]</td></tr><tr><td align="left" valign="top">Overgeneralization</td><td align="left" valign="top">LLMs might overly generalize topics, missing out on specific subtopics or nuances.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref23">23</xref>]</td></tr><tr><td align="left" valign="top">Sensitivity to input</td><td align="left" valign="top">Small changes in input phrasing can sometimes lead to different topic interpretations by the LLM.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref24">24</xref>]</td></tr><tr><td align="left" valign="top">Memory limitations</td><td align="left" valign="top">Due to token limits, LLMs might not capture very long or detailed discussions effectively for topic analysis.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref25">25</xref>]</td></tr><tr><td align="left" valign="top">Interactivity limitations</td><td align="left" valign="top">While LLMs can process static text effectively, they might struggle with dynamic topic analysis, where user feedback or real-time adjustments are required.</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref26">26</xref>]</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap><p>Given these challenges, some studies suggest that the most effective qualitative analyses may involve a combination of human and AI insights, as human coders often recognize nuanced themes related to context, emotions, and cultural subtleties that AI may miss. 
For example, a study demonstrates the feasibility of AI as a research assistant, presenting a successful case-study of human-AI collaboration in research by merging the efficiency and accuracy of ChatGPT with human flexibility and context awareness [<xref ref-type="bibr" rid="ref27">27</xref>]. In addition, the usefulness of ChatGPT in qualitative analysis may depend on the researcher&#x2019;s ability to ask appropriate questions (prompts), with the output evaluated and supplemented by a human researcher before the final report and publication [<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>There is little guidance in the literature about how LLMs can be integrated into thematic analysis. Challenges associated with the use of LLMs, including overgeneralization and overfitting, need to be investigated in the context of using LLMs for interpreting the relevance of identified topics. Our focus in this work considers inductive thematic analysis, where themes are derived from data without preconceived frameworks, and semantic analysis, in which themes are identified within the explicit content of the data [<xref ref-type="bibr" rid="ref29">29</xref>]. We plan to consider a hybrid inductive and deductive approach in future work [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s1-3"><title>Study Objectives</title><p>This study considers the possibility of enhancing human productivity by applying LLMs in the interpretation and labeling stage of topic modeling. We present a case study in which data were gathered from an online forum and grouped using text mining tools, and then interpreted for themes in parallel: (1) by human coders and (2) by providing text samples from each classification group to an LLM and prompting the LLM for thematic summarization.</p><p>We compared the human- and LLM-generated themes along 4 qualitative dimensions: alignment, convergence, coherence, and complementarity. 
Based on this analysis, we demonstrate the feasibility of using an LLM to support human thematic interpretation for qualitative research and offer insights into where researchers may find benefit in using LLMs to support thematic interpretation, and where they should exercise caution.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>The proposed methodology is based on three phases: (1) construction of a dataset and topic modeling using LDA, (2) labeling identified groups into topics through human interpretation and through use of LLM, and (3) comparison of identified topics.</p></sec><sec id="s2-2"><title>Data Collection and Preprocessing</title><p>The data comprises discussions from a publicly accessible Nurse Forum [<xref ref-type="bibr" rid="ref4">4</xref>]. Data come from posts aggregated over 28 2-week periods from March 2020 to April 2021. Our preprocessing approach ensures that the data is clean, standardized, and focused on the most relevant linguistic features, allowing for a clearer identification of the key aspects discussed in the nurse forum over time. Texts were tokenized using the Python library Gensim [<xref ref-type="bibr" rid="ref31">31</xref>]. Preprocessing included lowercasing and removing punctuation to ensure uniformity and reduce noise in the text. Stop words, including domain-specific terms like &#x201C;covid&#x201D; and &#x201C;covid 19,&#x201D; were removed, in addition to those in the Natural Language Toolkit (NLTK) library, to focus on meaningful content. Bigrams and trigrams were added to the corpora to identify common multiword expressions, which enhances the detection of contextually significant phrases. 
Finally, texts were lemmatized using SpaCy (Explosion) [<xref ref-type="bibr" rid="ref32">32</xref>], retaining only nouns, adjectives, verbs, and adverbs, to normalize words to their base forms and reduce dimensionality.</p></sec><sec id="s2-3"><title>Topic Modeling</title><p>Topic modeling was conducted using LDA to identify underlying themes in the text data. The LDA algorithm began with random assignments of topics to documents and words to topics. Through iterative optimization, it adjusts these assignments based on the likelihood of word-topic and topic-document distributions. We experimented with different numbers of topics and adjusted hyperparameters, to find the optimal model configuration. Coherence scores, which measure the semantic similarity of words within a topic, were computed for each run. Higher coherence scores indicate more meaningful and interpretable topics. The model with the highest coherence score was selected [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>This optimal model is then used to extract the top keywords for each topic, summarizing the themes present in the data. The distribution of topics across the corpus was visualized to interpret their prevalence in individual documents and the entire dataset, providing insights into the prominent themes discussed in the nurse forum during the specified period.</p></sec><sec id="s2-4"><title>Identification of Topics Through Human Interpretation</title><p>Thematic analysis was conducted by 2 coders working independently to familiarize themselves with the data by exhaustively reading the top 10 posts within each topic (ranked based on coherence scores) generated by the topic models [<xref ref-type="bibr" rid="ref34">34</xref>]. The selected theme names for the labeled topics were compared, which achieved an initial interannotator agreement of 68% (210/310), and 94% (292/310) after a subsequent round. 
For the remaining 6% (18/310), the underlying posts were examined together to resolve the disagreements, which left no unresolved annotations. The interpretation analysis resulted in 16.5% (51/310) of the identified themes being categorized as having low coherence.</p></sec><sec id="s2-5"><title>Theme Derivation Using Large Language Models</title><p>Following topic modeling, an LLM was used to derive themes from the identified topics. We created a custom function that takes a system message and a list of user-assistant message pairs, ensuring proper formatting and role assignment. We use the GPT-3.5 based model, specifying the structured messages, temperature, and seed for reproducibility [<xref ref-type="bibr" rid="ref35">35</xref>]. The system prompt is embedded, ensuring consistency in use of the associated set of instructions. The model was chosen for its advanced NLP capabilities, including context-awareness and adaptability to specific thematic contexts, and accessibility to the research team. The prompt instructs ChatGPT to generate themes and subthemes for more nuanced theme identification, addressing the issue of overly broad categorizations observed in initial experiments. An overview of the modeling steps is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-6"><title>Comparison of Identified Topics</title><p>The reliability of coding textual data can be challenging as the goal in content analysis is to attain a &#x201C;scientific&#x201D; analysis characterized by reliability, which implies stability in the phenomenon being studied and explicit analytic procedures to ensure that any reasonably qualified person would yield identical results [<xref ref-type="bibr" rid="ref36">36</xref>]. 
Intercoder agreement emerges as a key tool in achieving a reliable coding scheme, assessing the extent to which coders assign identical codes to the same set of data [<xref ref-type="bibr" rid="ref34">34</xref>]. A 5-item ordinal scale typically measures this agreement, with the anchors of &#x201C;Perfect Agreement,&#x201D; representing where coders completely agree on codes or categories assigned to data, and &#x201C;Slight Agreement,&#x201D; representing very little consensus, or significant disagreement, among the coders in how they code the data. This agreement scale is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>A novel 7-point scale was developed following a pilot test conducted by two of the authors to address the complexities of comparing codes generated by humans and ChatGPT. This scale, presented in the first column of <xref ref-type="table" rid="table2">Table 2</xref>, focuses on exploring the complementary and divergent insights between human-generated and ChatGPT-generated codes. It emphasizes the value of examining differences, especially in cases of low coherence among human-coded data, which allows researchers to uncover nuanced perspectives and understandings contained in ChatGPT-generated themes and within subthemes variability, with the possibility of revealing new and meaningful insights. 
It serves as a dynamic tool that stresses the importance of learning from intercoding differences rather than seeking strict agreement and validation, as is valued among qualitative researchers [<xref ref-type="bibr" rid="ref37">37</xref>].</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Agreement between large language model (LLM) and human coding.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Agreement scale</td><td align="left" valign="bottom">Number of topics, n</td><td align="left" valign="bottom">Rate of agreement, %</td></tr></thead><tbody><tr><td align="left" valign="top">ChatGPT and human coding themes are aligned, coders largely interpret and code the data in a consistent manner.</td><td align="left" valign="top">95</td><td align="left" valign="top">30.6</td></tr><tr><td align="left" valign="top">Substantial agreement: ChatGPT&#x2019;s subthemes are aligned with human coding, some subthemes provide complementary perspectives or unique insights.</td><td align="left" valign="top">101</td><td align="left" valign="top">32.6</td></tr><tr><td align="left" valign="top">Substantial agreement: ChatGPT&#x2019;s themes are divergent, human coding classified as low coherence.</td><td align="left" valign="top">51</td><td align="left" valign="top">16.5</td></tr><tr><td align="left" valign="top">Moderate agreement: there is a reasonable level of consensus between ChatGPT and human coding, but there are significant differences in interpretation or coding for some subthemes.</td><td align="left" valign="top">15</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Fair agreement: ChatGPT&#x2019;s themes are considered too broad, there are substantial discrepancies between ChatGPT&#x2019;s subthemes compared with human coding.</td><td align="left" valign="top">30</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Poor agreement: 
ChatGPT&#x2019;s theme specific, yet divergent from human coding.</td><td align="left" valign="top">4</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Poor agreement: ChatGPT&#x2019;s theme specific, yet low coherence in human coding.</td><td align="left" valign="top">14</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Grand total</td><td align="left" valign="top">310</td><td align="left" valign="top">79.7</td></tr></tbody></table></table-wrap><p>We then use GPT-4 for topic comparison, accessing the ChatGPT engine through an application programming interface (API) for programmatic purposes. Each prompt included the human-coded themes and the LLM-generated themes, requesting the LLM to assess the agreement based on 4 criteria: alignment, convergence, coherence, and complementarity between the themes. A detailed overview of the prompts is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Alignment assesses the correspondence between ChatGPT and human themes in terms of contextual agreement between the themes [<xref ref-type="bibr" rid="ref38">38</xref>], rather than lexical agreement. Convergence provides a similar comparison at the level of specific &#x201C;ChatGPT Subthemes&#x201D; with reference to the &#x201C;Human Theme.&#x201D; Coherence evaluates the logical consistency within the &#x201C;ChatGPT Theme&#x201D; and its subthemes, emphasizing the cohesion in both logic and meaning [<xref ref-type="bibr" rid="ref39">39</xref>]. 
Complementarity looks at whether the ChatGPT subthemes offer valuable additional insights or perspectives that enhance the human theme by providing detailed mechanistic explanations that align with and build upon the established human theme without contradicting it [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>LLM outputs were parsed to extract values for alignment, coherence, convergence, and complementarity. Human coders then compared the remaining results of reliability analysis with the LLM-generated comparison.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>This study does not involve human subjects, identifiable private information, or direct interactions with individuals. Instead, it relies exclusively on publicly available, anonymized social media posts. Consequently, institutional review board approval was deemed unnecessary.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Analysis of Reliability</title><p>The LDA analysis identified 310 topics. In thematic analysis, the team considered the topics identified, groups of words, and representative blog post samples in each topic and categorized the 310 topics into 58 subthemes.</p><p>A total of 2 authors independently classified the level of agreement on each topic against the themes and subthemes generated by ChatGPT, using the 7-point agreement scale in <xref ref-type="table" rid="table2">Table 2</xref>. The authors then met to compare assessments and resolve disagreements. The overall reliability is estimated at 79.7% (247/310), which represents substantial agreement according to the intercoder reliability benchmark [<xref ref-type="bibr" rid="ref36">36</xref>].</p><p><xref ref-type="table" rid="table2">Table 2</xref> provides a breakdown of agreement along the comparison scale, with 30.6% (95/310) reflecting taxonomic agreement in themes identified by the human coder and ChatGPT. 
For example, in one case the human-coder&#x2019;s theme is &#x201C;PPE resource availability and control&#x201D; and the ChatGPT theme is &#x201C;Mask Availability and Usage in Healthcare Settings.&#x201D;</p><p>In 32.6% (101/310) of the themes the agreement is at the subtheme level. For example, in one instance the human-coded theme is &#x201C;Testing policies in different settings,&#x201D; while the ChatGPT theme is &#x201C;Challenges and Controversies Surrounding COVID-19 Testing,&#x201D; which was not considered at the same level of specificity of the human coder&#x2019;s theme. The ChatGPT subthemes are &#x201C;Allocation of Testing Resources,&#x201D; &#x201C;Flaws in Testing Systems,&#x201D; and &#x201C;Impact on Public Health and Society.&#x201D; In the first subtheme the discussion revolves around whether COVID-19 tests should be prioritized for hospitalized patients or health care workers, matching the theme identified by humans. Adding to the reliability of the method, we have the agreement on the lack of coherence of the posts included in the LLM topic, representing 16.5% (51/310) of the topics.</p></sec><sec id="s3-2"><title>Alignment and Convergence</title><p>LLM provided results on alignment and convergence that we compare with the human evaluation of agreement. 
The results are displayed in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Analysis of alignment (theme level) and convergence (subtheme level).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Alignment: Compare the &#x201C;human theme&#x201D; and the &#x201C;ChatGPT theme&#x201D;</td><td align="left" valign="bottom" colspan="3">Convergence: Compare the specifics in &#x201C;ChatGPT Subtheme&#x201D; with the &#x201C;human theme.&#x201D;</td></tr><tr><td align="left" valign="top">Agreement scale</td><td align="left" valign="top">Total, n</td><td align="left" valign="top">Aligned, n</td><td align="left" valign="top">Misaligned, n</td><td align="left" valign="top">Meets expectation, %</td><td align="left" valign="top">Convergent, n</td><td align="left" valign="top">Divergent, n</td><td align="left" valign="top">Meets expectation, %</td></tr></thead><tbody><tr><td align="left" valign="top">ChatGPT and human coding themes are aligned, coders largely interpret and code the data in a consistent manner.</td><td align="left" valign="top">95</td><td align="left" valign="top">86<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">9</td><td align="left" valign="top">91</td><td align="left" valign="top">86<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">9</td><td align="left" valign="top">91</td></tr><tr><td align="left" valign="top">Substantial agreement: ChatGPT&#x2019;s subthemes are aligned with human coding, some subthemes provide complementary perspectives or unique insights.</td><td align="left" valign="top">101</td><td align="left" valign="top">90<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">11</td><td align="left" 
valign="top">89</td><td align="left" valign="top">91<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">10</td><td align="left" valign="top">90</td></tr><tr><td align="left" valign="top">Substantial agreement: ChatGPT&#x2019;s themes are divergent, human coding classified as low coherence.</td><td align="left" valign="top">51</td><td align="left" valign="top">6</td><td align="left" valign="top">45<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">88</td><td align="left" valign="top">5</td><td align="left" valign="top">36<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">71</td></tr><tr><td align="left" valign="top">Moderate agreement: there is a reasonable level of consensus between ChatGPT and human coding, but there are significant differences in interpretation or coding for some subthemes.</td><td align="left" valign="top">15</td><td align="left" valign="top">10<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">5</td><td align="left" valign="top">67</td><td align="left" valign="top">11<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">4</td><td align="left" valign="top">73</td></tr><tr><td align="left" valign="top">Fair agreement: ChatGPT&#x2019;s themes are considered too broad, there are substantial discrepancies between ChatGPT subthemes compared with human coding.</td><td align="left" valign="top">30</td><td align="left" valign="top">18</td><td align="left" valign="top">12</td><td align="left" valign="top">0</td><td align="left" valign="top">17</td><td align="left" valign="top">13<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">43</td></tr><tr><td align="left" valign="top">Poor agreement: ChatGPT&#x2019;s theme specific, yet divergent from human coding.</td><td align="left" 
valign="top">4</td><td align="left" valign="top">1</td><td align="left" valign="top">3<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">75</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Poor agreement: ChatGPT&#x2019;s theme specific, yet low coherence in human coding.</td><td align="left" valign="top">14</td><td align="left" valign="top">2</td><td align="left" valign="top">12<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">86</td><td align="left" valign="top">1</td><td align="left" valign="top">8<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">57</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Expectation is &#x201C;aligned&#x201D; for items in the agreement scale.</p></fn><fn id="table3fn2"><p><sup>b</sup>Expectation is &#x201C;convergent&#x201D; in the agreement scale.</p></fn><fn id="table3fn3"><p><sup>c</sup>Expectation is &#x201C;misaligned&#x201D; in the agreement scale.</p></fn><fn id="table3fn4"><p><sup>d</sup>Expectation is &#x201C;divergent&#x201D; in the agreement scale.</p></fn></table-wrap-foot></table-wrap><p>We found high level of alignment and convergence for themes classified as high on agreement by human coder. For scale item 1 there was 91% (86/95) alignment and 91% (86/95) convergence, and for scale item 2, there was 89% (90/101) alignment and 90% (91/101) convergence. As expected, we find misalignment for scale items 3 and 7.</p><p>The results for scale item 5 (ChatGPT&#x2019;s themes are considered too broad, there are substantial discrepancies between ChatGPT subthemes compared with human coding) reveal specific nuances of the LLM comparison. Although we expect subthemes to be divergent based on human classification, only 43% (13/30) were classified as divergent by the LLM. 
For example, a topic labeled by human coders as &#x201C;Knowledge about virus,&#x201D; because the posts in general discuss the nature of COVID-19, was labeled by the LLM as &#x201C;COVID-19 and its implications for healthcare workers,&#x201D; which is considered much broader although aligned. However, the first subtheme, &#x201C;Understanding the nature of coronaviruses and COVID-19,&#x201D; is both aligned and convergent with the human-generated theme, while the other two subthemes, &#x201C;Importance of proper PPE and testing for healthcare workers&#x201D; and &#x201C;Concerns and challenges in healthcare settings and home care,&#x201D; are clearly divergent from the narrow scope defined by the human theme. Although some subthemes may be tangential, the LLM still classifies them as convergent within a broader framework of idea similarity.</p></sec><sec id="s3-3"><title>Coherence</title><p>Coherence evaluates the logical consistency within the &#x201C;ChatGPT Theme&#x201D; and its subthemes. The results are displayed in <xref ref-type="table" rid="table4">Table 4</xref>. Coherence was high for items 1, 2, and 4 in the agreement scale, meeting expectations. We expected coherence to be low for scale item 5. However, contrary to our expectations, ChatGPT identified 97% (29/30) of cases as coherent. 
Although human interpretation viewed the LLM theme as broad and the subthemes as tangential, the LLM found logical consistency among these items within the broader scope of the theme.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Analysis of coherence and complementarity.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">Analysis of coherence</td><td align="left" valign="bottom" colspan="2">Analysis of complementarity</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Coherent, n</td><td align="left" valign="top">Low coherence, n</td><td align="left" valign="top">Meets expectation, %</td><td align="left" valign="top">Complementary, n</td><td align="left" valign="top">Meets expectation, %</td></tr></thead><tbody><tr><td align="left" valign="top">ChatGPT and human coding themes are aligned, coders largely interpret and code the data in a consistent manner.</td><td align="left" valign="top">94<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">1</td><td align="left" valign="top">99</td><td align="left" valign="top">93<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">98</td></tr><tr><td align="left" valign="top">Substantial agreement: ChatGPT&#x2019;s subthemes are aligned with human coding, some subthemes provide complementary perspectives or unique insights.</td><td align="left" valign="top">101<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0</td><td align="left" valign="top">100</td><td align="left" valign="top">97<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">96</td></tr><tr><td align="left" valign="top">Substantial agreement: ChatGPT&#x2019;s themes are divergent, human coding classified as low coherence.</td><td align="left" 
valign="top">49</td><td align="left" valign="top">2<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">4</td><td align="left" valign="top">8</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Moderate agreement: there is a reasonable level of consensus between ChatGPT and human coding, but there are significant differences in interpretation or coding for some subthemes.</td><td align="left" valign="top">15<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">0</td><td align="left" valign="top">100</td><td align="left" valign="top">15<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">Fair agreement: ChatGPT&#x2019;s themes are considered too broad, there are substantial discrepancies between ChatGPT subthemes compared with human coding.</td><td align="left" valign="top">29</td><td align="left" valign="top">1<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">3</td><td align="left" valign="top">26</td><td align="left" valign="top">87</td></tr><tr><td align="left" valign="top">Poor agreement: ChatGPT&#x2019;s theme specific, yet divergent from human coding</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Poor agreement: ChatGPT&#x2019;s theme specific, yet low coherence in human coding</td><td align="left" valign="top">14</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">4</td><td align="left" valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Expectation is &#x201C;coherent&#x201D; in the agreement scale.</p></fn><fn 
id="table4fn2"><p><sup>b</sup>Expectation is &#x201C;complementary&#x201D; in the agreement scale.</p></fn><fn id="table4fn3"><p><sup>c</sup>Expectation is &#x201C;low coherence&#x201D; in the agreement scale.</p></fn></table-wrap-foot></table-wrap><p>Another unexpected result concerns scale item 3, where 96% (49/51) of the topics were marked as coherent despite being rated as &#x201C;low coherence&#x201D; by human coders. Contrary to expectations, 49 out of 51 cases were classified as coherent. The LLM relies on single posts to generate subthemes with logical consistency. We illustrate this finding with 2 examples.</p><p>One topic was themed by ChatGPT as &#x201C;Nurses&#x2019; Safety and Well-being&#x201D; with the subthemes of &#x201C;Personal sacrifices and concerns for personal safety&#x201D; and &#x201C;Need for better protection and compensation.&#x201D; However, the second subtheme was generated based on a single post that mentions hazard pay: &#x201C;It would be nice if hospitals offered hazard pay, but I&#x2019;m sure they&#x2019;re also hurting financially given all of the new measures they&#x2019;re having to put into place. 
[&#x2026;]; many are losing a lot of anticipated revenue because they&#x2019;ve canceled their non-emergency surgeries.&#x201D; There is insufficient evidence to support the inclusion of this theme.</p><p>Another topic ChatGPT themed as &#x201C;Challenges and Considerations in Nursing and Healthcare&#x201D; with the subthemes of &#x201C;Trust and Distrust in Healthcare&#x201D; and &#x201C;Disparities in Healthcare.&#x201D; Although these are considered consistent with theme, the first subtheme is based on a post highlighting the impact of past negative experiences on trust, and the second subtheme is described by ChatGPT as emphasizing the importance of recognizing and addressing disparities that affect various groups, such as gender, age, ethnicity, and socioeconomic status; however, it is based on the following post: &#x201C;There are disparities... People we love and care about. Yes, I think it&#x2019;s important to identify areas that are of particular concern and groups that are especially vulnerable. We need to learn and use that knowledge to try to improve our collective future.&#x201D;</p><p>A total of 2 topics were classified as low coherence, which agreed with the corresponding &#x201C;low coherence&#x201D; human theme designation. ChatGPT themed 1 topic as &#x201C;Medications and Health Concerns&#x201D; with the subthemes of &#x201C;Medication Switch and COVID-19,&#x201D; &#x201C;Casual Conversations and Expressions,&#x201D; and &#x201C;Concern and Well-Wishes for Health,&#x201D; yet recognized as low coherence. 
The second topic was themed by ChatGPT as &#x201C;Controversial Issues in Healthcare&#x201D; and has subthemes of &#x201C;Use of Hydroxychloroquine for COVID-19 Treatment,&#x201D; &#x201C;Systemic Racism and Police Brutality,&#x201D; and &#x201C;Challenges in Ensuring Compliance with Infection Control Measures.&#x201D;</p></sec><sec id="s3-4"><title>Complementarity</title><p>The analysis of complementarity is also provided in <xref ref-type="table" rid="table4">Table 4</xref>. For scale items 1, 2, and 4 the expectation was that the subthemes provide complementarity to the human-generated theme and the results meet expectations (98% (93/95), 96% (97/101), and 100% (15/15), respectively). For example, one topic with the human-generated theme of &#x201C;Testing policies in different settings&#x201D; was associated with the ChatGPT subthemes of &#x201C;Allocation of Testing Resources,&#x201D; &#x201C;Flaws in Testing Systems,&#x201D; and &#x201C;Impact on Public Health and Society.&#x201D; The first subtheme is about whether testing availability should be prioritized for hospitalized patients or health care workers, but the second subtheme highlights significant complementary issues with regard to flaws in the CDC&#x2019;s COVID-19 testing protocols, delays in fixing the tests, and the impact on the ability to detect and track the spread of the virus. The third theme expanded further into the social implications of the impact of inadequate testing resources, limited testing on the perception of the virus&#x2019;s severity, and the potential spread of the virus due to lack of testing and preventive measures.</p><p>Conversely, the expectation for the agreement scale item 5 was that complementarity would be low, yet ChatGPT found 87% complementarity. 
For instance, in the example mentioned above, the topic labeled by human-coders as &#x201C;Knowledge about virus,&#x201D; the subthemes are considered divergent (&#x201C;Importance of proper PPE and testing for healthcare workers&#x201D;) and too broad (&#x201C;Concerns and challenges in healthcare settings and home care&#x201D;) when compared with the scope defined by &#x201C;knowledge about the virus.&#x201D; The posts on these themes cover diverse topics such as the importance of proper personal protective equipment (PPE), concerns about testing and returning to work, the potential risks involved in home care, questions about Health Insurance Portability and Accountability Act (HIPAA) regulations, and the need for research on treatment options. The complementarity of themes only exists in a very broad sense and can be considered as &#x201C;out of context.&#x201D;</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our study offers several significant insights into the use of ChatGPT for the augmentation of topic models. A key finding is the importance of considering different levels of abstraction in theme analysis. The division into themes and subthemes is crucial for uncovering specific nuances, addressing the risk of overgeneralization inherent in LLMs.</p><p>Furthermore, our exploration of subthemes reveals that LLMs, in general, can resolve ambiguity, aiding in the clear categorization of topics, even from a limited dataset. 
The effective handling of &#x201C;low-coherence&#x201D; topics such as &#x201C;health disparities&#x201D; and the complementary insights provided on subthemes of &#x201C;Testing policies in different settings&#x201D; demonstrate the LLM&#x2019;s proficiency in navigating and categorizing complex subject matter at the subtheme level.</p><p>In terms of overall reliability, our study estimates a 79.7% (247/310) agreement level, positioning it at the high end of substantial agreement (60%-80%), just below almost perfect agreement (80%-100%), on the intercoder reliability benchmark scale. This suggests a robust level of agreement between human coders and the LLM, indicating a reliable consistency in the classification of topics.</p><p>However, the examination of alignment and convergence reveals a nuanced aspect of LLM performance. While LLMs exhibit high accuracy in identifying alignment and convergence for topics classified by human analysis as aligned, a notable challenge arises when classifying divergent subthemes. The LLM tends to classify divergent subthemes as convergent, particularly when one of the subthemes converges in similar ideas, leading to a potential misrepresentation of thematic divergence.</p><p>The evaluation of coherence yields an unexpected result, highlighting the issue of &#x201C;overfitting.&#x201D; Specifically, topics classified as coherent by the LLM contradict human coders&#x2019; assessments of low coherence. This suggests a potential challenge where ChatGPT may force-fit solutions that match specific data points (posts) but are &#x201C;too good to be true&#x201D; from a pattern standpoint, lacking the broader pattern consistency expected in thematic coherence. ChatGPT may be construing the theme based on the wealth of data at its disposal.</p><p>The analysis of complementarity confirms that LLMs identify subthemes that provide additional insights to themes in human researchers&#x2019; findings. 
LLMs can successfully identify niche topics, showcasing their potential to uncover unique thematic elements.</p><p>Our study emphasizes the critical importance of providing adequate contextual framing to ChatGPT-based classification. The challenge of lack of context becomes apparent, as LLMs may misinterpret or overlook certain topic nuances without external knowledge or the ability to track long-term context.</p></sec><sec id="s4-2"><title>Limitations</title><p>The study is limited by (1) our focus on a single social media source and (2) the LLM used. First, we focus on data from a single nurse forum, but future inclusion of additional social media sites, including those used in other countries and by users who speak other languages, may enhance the results reported here. Furthermore, while we used the OpenAI chat completion API (GPT-3.5 and GPT-4) for thematic analysis due to its accessibility to the research team, other language models have since emerged. These newer models should be tested to determine if they perform better in different contexts. In addition, we kept the LLM prompts as simple as possible to demonstrate that even using a simple approach, the generative AI could produce solid results. Further work can apply fine-tuning to prompting and design approaches to enhance the thematic analysis capabilities of LLMs, such as retrieval-augmented generation (RAG). Finally, we focus on inductive thematic analysis and short-form content data. We recognize that long-form text data may pose distinct challenges in applying LLMs.</p></sec><sec id="s4-3"><title>Implications</title><p>For the LLM challenges found in this study, such as overgeneralization and overfitting, future studies may apply different guardrails, such as implementing algorithms that detect and mitigate biases during both training and generation phases. 
These guardrails monitor and filter the outputs of LLMs addressing different requirements such as hallucinations in LLM outputs [<xref ref-type="bibr" rid="ref42">42</xref>].</p><p>Future research could investigate the potential of feeding raw transcripts into ChatGPT and incorporating AI-generated themes into triangulation discussions. By contributing to triangulation, this approach promises to unveil potential oversights, present alternative perspectives, and highlight inherent researchers&#x2019; personal biases. By seamlessly incorporating AI into the discourse analysis process, researchers may uncover a richer understanding of the subject matter, fostering a more comprehensive and nuanced exploration of diverse perspectives. This integration not only enhances the depth of analysis but also provides a valuable tool for refining methodologies and mitigating potential biases, ultimately contributing to the advancement of research methodologies in the burgeoning field of AI-driven discourse analysis.</p></sec><sec id="s4-4"><title>Conclusions</title><p>Overall, this study underscores the multifaceted nature of using ChatGPT for thematic analysis, acknowledging both its strengths and challenges. 
The insights gained contribute to a more nuanced understanding of the capabilities and limitations of LLMs in handling complex topical data in the healthcare field, offering valuable considerations for future research in the intersection of artificial intelligence and discourse analysis.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb4">LDA</term><def><p>latent Dirichlet allocation</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb7">NLTK</term><def><p>Natural Language Toolkit</p></def></def-item><def-item><term id="abb8">PPE</term><def><p>personal protective equipment</p></def></def-item><def-item><term id="abb9">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hussain</surname><given-names>MI</given-names> </name><name name-style="western"><surname>Figueiredo</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>BD</given-names> </name><etal/></person-group><article-title>A scoping review of qualitative research in JAMIA: past contributions and opportunities for future work</article-title><source>J Am Med Inform 
Assoc</source><year>2021</year><month>02</month><day>15</day><volume>28</volume><issue>2</issue><fpage>402</fpage><lpage>413</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa179</pub-id><pub-id pub-id-type="medline">33225361</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ranade-Kharkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Weir</surname><given-names>C</given-names> </name><name name-style="western"><surname>Norlin</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Information needs of physicians, care coordinators, and families to support care coordination of children and youth with special health care needs (CYSHCN)</article-title><source>J Am Med Inform Assoc</source><year>2017</year><month>09</month><day>1</day><volume>24</volume><issue>5</issue><fpage>933</fpage><lpage>941</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocx023</pub-id><pub-id pub-id-type="medline">28371887</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scharp</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hobensack</surname><given-names>M</given-names> </name><name name-style="western"><surname>Davoudi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Topaz</surname><given-names>M</given-names> </name></person-group><article-title>Natural language processing applied to clinical documentation in post-acute care settings: a scoping review</article-title><source>J Am Med Dir Assoc</source><year>2024</year><month>01</month><volume>25</volume><issue>1</issue><fpage>69</fpage><lpage>83</lpage><pub-id pub-id-type="doi">10.1016/j.jamda.2023.09.006</pub-id><pub-id 
pub-id-type="medline">37838000</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Castellanos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Castillo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gomes</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>VanderMeer</surname><given-names>D</given-names> </name></person-group><article-title>Nurses&#x2019; work concerns and disenchantment during the COVID-19 pandemic: machine learning analysis of web-based discussions</article-title><source>JMIR Nurs</source><year>2023</year><month>02</month><day>6</day><volume>6</volume><fpage>e40676</fpage><pub-id pub-id-type="doi">10.2196/40676</pub-id><pub-id pub-id-type="medline">36608261</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hobensack</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ojo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Barr&#x00F3;n</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Documentation of hospitalization risk factors in electronic health records (EHRs): a qualitative study with home healthcare clinicians</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>04</month><day>13</day><volume>29</volume><issue>5</issue><fpage>805</fpage><lpage>812</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac023</pub-id><pub-id pub-id-type="medline">35196369</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation 
citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>WX</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>K</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 31, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.18223</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x00C4;lg&#x00E5;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Eriksson</surname><given-names>O</given-names> </name><name name-style="western"><surname>Nordberg</surname><given-names>M</given-names> </name></person-group><article-title>Analysis of scientific publications during the early phase of the COVID-19 pandemic: topic modeling study</article-title><source>J Med Internet Res</source><year>2020</year><month>11</month><day>10</day><volume>22</volume><issue>11</issue><fpage>e21559</fpage><pub-id pub-id-type="doi">10.2196/21559</pub-id><pub-id pub-id-type="medline">33031049</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chandrasekaran</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mehta</surname><given-names>V</given-names> </name><name name-style="western"><surname>Valkunde</surname><given-names>T</given-names> </name><name name-style="western"><surname>Moustakas</surname><given-names>E</given-names> </name></person-group><article-title>Topics, trends, and sentiments of tweets about the COVID-19 pandemic: temporal infoveillance study</article-title><source>J Med 
Internet Res</source><year>2020</year><month>10</month><day>23</day><volume>22</volume><issue>10</issue><fpage>e22624</fpage><pub-id pub-id-type="doi">10.2196/22624</pub-id><pub-id pub-id-type="medline">33006937</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kavvadias</surname><given-names>S</given-names> </name><name name-style="western"><surname>Drosatos</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kaldoudi</surname><given-names>E</given-names> </name></person-group><article-title>Supporting topic modeling and trends analysis in biomedical literature</article-title><source>J Biomed Inform</source><year>2020</year><month>10</month><volume>110</volume><fpage>103574</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2020.103574</pub-id><pub-id pub-id-type="medline">32971274</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jelodar</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Latent Dirichlet allocation (LDA) and topic modeling: models, applications, a survey</article-title><source>Multimed Tools Appl</source><year>2019</year><month>06</month><volume>78</volume><issue>11</issue><fpage>15169</fpage><lpage>15211</lpage><pub-id pub-id-type="doi">10.1007/s11042-018-6894-4</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Buenano-Fernandez</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Gonzalez</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gil</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lujan-Mora</surname><given-names>S</given-names> </name></person-group><article-title>Text mining of open-ended questions in self-assessment of university teachers: an LDA topic modeling approach</article-title><source>IEEE Access</source><year>2020</year><volume>8</volume><fpage>35318</fpage><lpage>35330</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2020.2974983</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Ibrahim</surname><given-names>EI</given-names> </name><name name-style="western"><surname>Voyer</surname><given-names>A</given-names> </name></person-group><article-title>The augmented qualitative researcher: using generative AI in qualitative text analysis</article-title><source>SocArXiv</source><comment>Preprint posted online on  Jan 22, 2024</comment><pub-id pub-id-type="doi">10.31235/osf.io/gkc8w</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilardi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Alizadeh</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kubli</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT outperforms crowd workers for text-annotation tasks</article-title><source>Proc Natl Acad Sci U S A</source><year>2023</year><month>07</month><day>25</day><volume>120</volume><issue>30</issue><fpage>e2305016120</fpage><pub-id pub-id-type="doi">10.1073/pnas.2305016120</pub-id><pub-id pub-id-type="medline">37463210</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morgan</surname><given-names>DL</given-names> </name></person-group><article-title>Exploring the use of artificial intelligence for qualitative data analysis: the case of ChatGPT</article-title><source>Int J Qual Methods</source><year>2023</year><month>10</month><volume>22</volume><fpage>16094069231211248</fpage><pub-id pub-id-type="doi">10.1177/16094069231211248</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamilton</surname><given-names>L</given-names> </name><name name-style="western"><surname>Elliott</surname><given-names>D</given-names> </name><name name-style="western"><surname>Quick</surname><given-names>A</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>S</given-names> </name><name name-style="western"><surname>Choplin</surname><given-names>V</given-names> </name></person-group><article-title>Exploring the use of AI in qualitative analysis: a comparative study of guaranteed income data</article-title><source>Int J Qual Methods</source><year>2023</year><month>10</month><volume>22</volume><pub-id pub-id-type="doi">10.1177/16094069231201504</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 11, 
2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>N</given-names> </name><name name-style="western"><surname>Frieske</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Survey of hallucination in natural language generation</article-title><source>ACM Comput Surv</source><year>2023</year><month>12</month><day>31</day><volume>55</volume><issue>12</issue><fpage>1</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1145/3571730</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Srivastava</surname><given-names>N</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> </name><name name-style="western"><surname>Krizhevsky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name><name name-style="western"><surname>Salakhutdinov</surname><given-names>R</given-names> </name></person-group><article-title>Dropout: a simple way to prevent neural networks from overfitting</article-title><source>J Mach Learn Res</source><year>2014</year><volume>15</volume><fpage>1929</fpage><lpage>1958</lpage><pub-id pub-id-type="doi">10.5555/2627435.2670313</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Xue</surname><given-names>F</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Zhou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>You</surname><given-names>Y</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Oh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Naumann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Globerson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Saenko</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hardt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Levine</surname><given-names>S</given-names> </name></person-group><article-title>To repeat or not to repeat: insights from scaling LLM under token-crisis</article-title><source>NIPS &#x2019;23: Proceedings of the 37th International Conference on Neural Information Processing Systems</source><year>2023</year><volume>36</volume><publisher-name>Curran Associates</publisher-name><fpage>59304</fpage><lpage>59322</lpage><pub-id pub-id-type="other">9781713899921</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Peters</surname><given-names>M</given-names> </name><name name-style="western"><surname>Neumann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Iyyer</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Deep contextualized word representations</article-title><source>Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)</source><year>2018</year><publisher-name>Association for Computational 
Linguistics</publisher-name><pub-id pub-id-type="doi">10.18653/v1/N18-1202</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bolukbasi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Saligrama</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kalai</surname><given-names>AT</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Lee</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sugiyama</surname><given-names>M</given-names> </name><name name-style="western"><surname>Luxburg</surname><given-names>U</given-names> </name><name name-style="western"><surname>Guyon</surname><given-names>I</given-names> </name><name name-style="western"><surname>Garnett</surname><given-names>R</given-names> </name></person-group><article-title>Man is to computer programmer as woman is to homemaker? 
Debiasing word embeddings</article-title><source>NIPS&#x2019;16: Proceedings of the 30th International Conference on Neural Information Processing Systems</source><year>2016</year><publisher-name>Curran Associates</publisher-name><fpage>4356</fpage><lpage>4364</lpage><pub-id pub-id-type="doi">10.5555/3157382.3157584</pub-id><pub-id pub-id-type="other">9781510838819</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>KF</given-names> </name></person-group><article-title>Investigating bias in LLM-based bias detection: disparities between LLMs and human perception</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 22, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.14896</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Griffiths</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jordan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tenenbaum</surname><given-names>J</given-names> </name><name name-style="western"><surname>Blei</surname><given-names>D</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Thrun</surname><given-names>S</given-names> </name><name name-style="western"><surname>Saul</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sch&#x00F6;lkopf</surname><given-names>B</given-names> </name></person-group><article-title>Hierarchical topic models 
and the nested Chinese restaurant process</article-title><source>Advances in Neural Information Processing Systems</source><year>2003</year><access-date>2025-03-27</access-date><volume>16</volume><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2003/file/7b41bfa5085806dfa24b8c9de0ce567f-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2003/file/7b41bfa5085806dfa24b8c9de0ce567f-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hajikhani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cole</surname><given-names>C</given-names> </name></person-group><article-title>A critical review of large language models: sensitivity, bias, and the path toward specialized AI</article-title><source>Quant Sci Stud</source><year>2024</year><month>08</month><day>1</day><volume>5</volume><issue>3</issue><fpage>736</fpage><lpage>756</lpage><pub-id pub-id-type="doi">10.1162/qss_a_00310</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hadsell</surname><given-names>R</given-names> </name><name name-style="western"><surname>Balcan</surname><given-names>MF</given-names> </name><name 
name-style="western"><surname>Lin</surname><given-names>H</given-names> </name></person-group><article-title>Language models are few-shot learners</article-title><source>Advances in Neural Information Processing Systems</source><year>2020</year><volume>33</volume><publisher-name>Curran Associates</publisher-name><pub-id pub-id-type="other">9781713829546</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amershi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cakmak</surname><given-names>M</given-names> </name><name name-style="western"><surname>Knox</surname><given-names>WB</given-names> </name><name name-style="western"><surname>Kulesza</surname><given-names>T</given-names> </name></person-group><article-title>Power to the people: the role of humans in interactive machine learning</article-title><source>AI Mag</source><year>2014</year><month>12</month><volume>35</volume><issue>4</issue><fpage>105</fpage><lpage>120</lpage><pub-id pub-id-type="doi">10.1609/aimag.v35i4.2513</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="thesis"><person-group person-group-type="author"><name name-style="western"><surname>Koch</surname><given-names>MA</given-names> </name></person-group><article-title>Turning chaos into meaning: a ChatGPT-assisted exploration of COVID-19 narratives [Master&#x2019;s thesis]</article-title><year>2023</year><access-date>2025-03-27</access-date><publisher-name>University of Twente</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://purl.utwente.nl/essays/96885">https://purl.utwente.nl/essays/96885</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Mesec</surname><given-names>B</given-names> 
</name></person-group><article-title>The language model of artificial intelligence ChatGPT - a tool of qualitative analysis of texts</article-title><source>Authorea</source><comment>Preprint posted online on  Apr 18, 2023</comment><pub-id pub-id-type="doi">10.22541/au.168182047.70243364/v1</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Braun</surname><given-names>V</given-names> </name><name name-style="western"><surname>Clarke</surname><given-names>V</given-names> </name></person-group><article-title>Thematic analysis: a practical guide</article-title><source>Thematic Analysis: A Practical Guide</source><year>2021</year><publisher-name>SAGE</publisher-name><pub-id pub-id-type="other">9781473953235</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Proudfoot</surname><given-names>K</given-names> </name></person-group><article-title>Inductive/deductive hybrid thematic analysis in mixed methods research</article-title><source>J Mix Methods Res</source><year>2023</year><month>07</month><volume>17</volume><issue>3</issue><fpage>308</fpage><lpage>326</lpage><pub-id pub-id-type="doi">10.1177/15586898221126816</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Rehurek</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sojka</surname><given-names>P</given-names> </name></person-group><article-title>Gensim&#x2013;Python framework for vector space modelling</article-title><year>2011</year><access-date>2025-03-27</access-date><publisher-name>NLP Centre, Faculty of Informatics, Masaryk University</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.fi.muni.cz/usr/sojka/posters/rehurek-sojka-scipy2011.pdf">https://www.fi.muni.cz/usr/sojka/posters/rehurek-sojka-scipy2011.pdf</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Srinivasa-Desikan</surname><given-names>B</given-names> </name></person-group><source>Natural Language Processing and Computational Linguistics: A Practical Guide to Text Analysis with Python, Gensim, Spacy, and Keras</source><year>2018</year><publisher-name>Packt Publishing Ltd</publisher-name><pub-id pub-id-type="other">9781788838535</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Stevens</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kegelmeyer</surname><given-names>P</given-names> </name><name name-style="western"><surname>Andrzejewski</surname><given-names>D</given-names> </name><name name-style="western"><surname>Buttler</surname><given-names>D</given-names> </name></person-group><article-title>Exploring topic coherence over many models and many topics</article-title><conf-name>2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning</conf-name><conf-date>Jul 12-14, 2012</conf-date><conf-loc>Jeju Island, Korea</conf-loc></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical 
data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id><pub-id pub-id-type="medline">843571</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>FF</given-names> </name><name name-style="western"><surname>Alon</surname><given-names>U</given-names> </name><name name-style="western"><surname>Neubig</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hellendoorn</surname><given-names>VJ</given-names> </name></person-group><article-title>A systematic evaluation of large language models of code</article-title><conf-name>MAPS &#x2019;22: 6th ACM SIGPLAN International Symposium on Machine Programming</conf-name><conf-date>Jun 13, 2022</conf-date><conf-loc>San Diego, CA, United States</conf-loc><pub-id pub-id-type="doi">10.1145/3520312.3534862</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Neuendorf</surname><given-names>KA</given-names> </name></person-group><source>The Content Analysis Guidebook</source><year>2017</year><edition>2</edition><publisher-name>SAGE</publisher-name><pub-id pub-id-type="doi">10.4135/9781071802878</pub-id><pub-id pub-id-type="other">9781071802878</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Salda&#x00F1;a</surname><given-names>J</given-names> </name></person-group><source>The Coding Manual for Qualitative Researchers</source><year>2021</year><edition>4</edition><publisher-name>SAGE</publisher-name><pub-id pub-id-type="other">9781473902497</pub-id></nlm-citation></ref><ref 
id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pickering</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Garrod</surname><given-names>S</given-names> </name></person-group><article-title>Toward a mechanistic psychology of dialogue</article-title><source>Behav Brain Sci</source><year>2004</year><month>04</month><volume>27</volume><issue>2</issue><fpage>169</fpage><lpage>190</lpage><pub-id pub-id-type="doi">10.1017/s0140525x04000056</pub-id><pub-id pub-id-type="medline">15595235</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>van Dijk</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Kintsch</surname><given-names>W</given-names> </name></person-group><source>Strategies of Discourse Comprehension</source><year>1983</year><publisher-name>Academic Press</publisher-name><pub-id pub-id-type="other">0127120505</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>HH</given-names> </name><name name-style="western"><surname>Brennan</surname><given-names>SE</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Resnick</surname><given-names>LB</given-names> </name><name name-style="western"><surname>Levine</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Teasley</surname><given-names>SD</given-names> </name></person-group><article-title>Grounding in communication</article-title><source>Perspectives on Socially Shared Cognition</source><year>1991</year><publisher-name>American Psychological 
Association</publisher-name><fpage>127</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1037/10096-006</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Gernsbacher</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Giv&#x00F3;n</surname><given-names>T</given-names> </name></person-group><source>Coherence in Spontaneous Text: Papers Presented at the Symposium on Coherence in Spontaneous Text</source><year>1992</year><publisher-name>University of Oregon</publisher-name></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Building guardrails for large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 2, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.01822</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Large language model&#x2019;s use for thematic analysis and classifying agreements.</p><media xlink:href="ai_v4i1e64447_app1.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material></app-group></back></article>