@Article{info:doi/10.2196/70566, author="AlFarabi Ali, Sarah and AlDehlawi, Hebah and Jazzar, Ahoud and Ashi, Heba and Esam Abuzinadah, Nihal and AlOtaibi, Mohammad and Algarni, Abdulrahman and Alqahtani, Hazzaa and Akeel, Sara and Almazrooa, Soulafa", title="The Diagnostic Performance of Large Language Models and Oral Medicine Consultants for Identifying Oral Lesions in Text-Based Clinical Scenarios: Prospective Comparative Study", journal="JMIR AI", year="2025", month="Apr", day="24", volume="4", pages="e70566", keywords="artificial intelligence", keywords="ChatGPT", keywords="Copilot", keywords="diagnosis", keywords="oral medicine", keywords="diagnostic performance", keywords="large language model", keywords="lesion", keywords="oral lesion", abstract="Background: The use of artificial intelligence (AI), especially large language models (LLMs), is increasing in health care, including in dentistry. There has yet to be an assessment of the diagnostic performance of LLMs in oral medicine. Objective: We aimed to compare the effectiveness of ChatGPT (OpenAI) and Microsoft Copilot (integrated within the Microsoft 365 suite) with oral medicine consultants in formulating accurate differential and final diagnoses for oral lesions from written clinical scenarios. Methods: Fifty comprehensive clinical case scenarios including patient age, presenting complaint, history of the presenting complaint, medical history, allergies, intra- and extraoral findings, lesion description, and any additional information including laboratory investigations and specific clinical features were given to three oral medicine consultants, who were asked to formulate a differential diagnosis and a final diagnosis. Specific prompts for the same 50 cases were designed and input into ChatGPT and Copilot to formulate both differential and final diagnoses. The diagnostic accuracy was compared between the LLMs and oral medicine consultants. Results: ChatGPT exhibited the highest accuracy, providing the correct differential diagnoses in 37 of 50 cases (74\%). There were no significant differences in the accuracy of providing the correct differential diagnoses between AI models and oral medicine consultants. ChatGPT was as accurate as consultants in making the final diagnoses, but Copilot was significantly less accurate than ChatGPT (P=.015) and one of the oral medicine consultants (P<.001) in providing the correct final diagnosis. Conclusions: ChatGPT and Copilot show promising performance for diagnosing oral medicine pathology in clinical case scenarios to assist dental practitioners. ChatGPT-4 and Copilot are still evolving, but even now, they might provide a significant advantage in the clinical setting as tools to help dental practitioners in their daily practice. 
", doi="10.2196/70566", url="https://ai.jmir.org/2025/1/e70566" } @Article{info:doi/10.2196/67481, author="Tandon, Mihir and Chetla, Nitin and Mallepally, Adarsh and Zebari, Botan and Samayamanthula, Sai and Silva, Jonathan and Vaja, Swapna and Chen, John and Cullen, Matthew and Sukhija, Kunal", title="Can Artificial Intelligence Diagnose Knee Osteoarthritis?", journal="JMIR Biomed Eng", year="2025", month="Apr", day="23", volume="10", pages="e67481", keywords="large language model", keywords="ChatGPT", keywords="GPT-4o", keywords="radiology", keywords="osteoarthritis", keywords="machine learning", keywords="X-rays", keywords="osteoarthritis detection", doi="10.2196/67481", url="https://biomedeng.jmir.org/2025/1/e67481", url="http://www.ncbi.nlm.nih.gov/pubmed/40266670" } @Article{info:doi/10.2196/58454, author="Leitner, Kirstin and Cutri-French, Clare and Mandel, Abigail and Christ, Lori and Koelper, Nathaneal and McCabe, Meaghan and Seltzer, Emily and Scalise, Laura and Colbert, A. James and Dokras, Anuja and Rosin, Roy and Levine, Lisa", title="A Conversational Agent Using Natural Language Processing for Postpartum Care for New Mothers: Development and Engagement Analysis", journal="JMIR AI", year="2025", month="Apr", day="22", volume="4", pages="e58454", keywords="conversational agent", keywords="postpartum care", keywords="text messaging", keywords="postpartum", keywords="natural language processing", keywords="pregnancy", keywords="parents", keywords="newborns", keywords="development", keywords="patient engagement", keywords="physical recovery", keywords="infant", keywords="infant care", keywords="survey", keywords="breastfeeding", keywords="support", keywords="patient support", keywords="patient satisfaction", abstract="Background: The ``fourth trimester,'' or postpartum time period, remains a critical phase of pregnancy that significantly impacts parents and newborns. Care poses challenges due to complex individual needs as well as low attendance rates at routine appointments. A comprehensive technological solution could provide a holistic and equitable solution to meet care goals. Objective: This paper describes the development of patient engagement data with a novel postpartum conversational agent that uses natural language processing to support patients post partum. Methods: We report on the development of a postpartum conversational agent from concept to usable product as well as the patient engagement with this technology. Content for the program was developed using patient- and provider-based input and clinical algorithms. Our program offered 2-way communication to patients and details on physical recovery, lactation support, infant care, and warning signs for problems. This was iterated upon by our core clinical team and an external expert clinical panel before being tested on patients. Patients eligible for discharge around 24 hours after delivery who had delivered a singleton full-term infant vaginally were offered use of the program. Patient demographics, accuracy, and patient engagement were collected over the first 6 months of use. Results: A total of 290 patients used our conversational agent over the first 6 months, of which 112 (38.6\%) were first time parents and 162 (56\%) were Black. In total, 286 (98.6\%) patients interacted with the platform at least once, 271 patients (93.4\%) completed at least one survey, and 151 (52\%) patients asked a question. First time parents and those breastfeeding their infants had higher rates of engagement overall. 
Black patients were more likely to promote the program than White patients (P=.047). The overall accuracy of the conversational agent during the first 6 months was 77\%. Conclusions: It is possible to develop a comprehensive, automated postpartum conversational agent. The use of such a technology to support patients postdischarge appears to be acceptable with very high engagement and patient satisfaction. ", doi="10.2196/58454", url="https://ai.jmir.org/2025/1/e58454" } @Article{info:doi/10.2196/68960, author="Wu, Xiaoli and Liew, Kongmeng and Dorahy, J. Martin", title="Trust, Anxious Attachment, and Conversational AI Adoption Intentions in Digital Counseling: A Preliminary Cross-Sectional Questionnaire Study", journal="JMIR AI", year="2025", month="Apr", day="22", volume="4", pages="e68960", keywords="attachment style", keywords="conversational artificial intelligence", keywords="CAI", keywords="perceived trust", keywords="adoption intentions", keywords="CAI counseling", keywords="mobile phone", abstract="Background: Conversational artificial intelligence (CAI) is increasingly used in various counseling settings to deliver psychotherapy, provide psychoeducational content, and offer support like companionship or emotional aid. Research has shown that CAI has the potential to effectively address mental health issues when its associated risks are handled with great caution. It can provide mental health support to a wider population than conventional face-to-face therapy, and at a faster response rate and more affordable cost. Despite CAI's many advantages in mental health support, potential users may differ in their willingness to adopt and engage with CAI to support their own mental health. Objective: This study focused specifically on dispositional trust in AI and attachment styles, and examined how they are associated with individuals' intentions to adopt CAI for mental health support. Methods: A cross-sectional survey of 239 American adults was conducted. Participants were first assessed on their attachment style, then presented with a vignette about CAI use, after which their dispositional trust and subsequent adoption intentions toward CAI counseling were surveyed. Participants had not previously used CAI for digital counseling for mental health support. Results: Dispositional trust in artificial intelligence emerged as a critical predictor of CAI adoption intentions (P<.001), while attachment anxiety (P=.04), rather than avoidance (P=.09), was found to be positively associated with the intention to adopt CAI counseling after controlling for age and gender. Conclusions: These findings indicated higher dispositional trust might lead to stronger adoption intention, and higher attachment anxiety might also be associated with greater CAI counseling adoption. Further research into users' attachment styles and dispositional trust is needed to understand individual differences in CAI counseling adoption for enhancing the safety and effectiveness of CAI-driven counseling services and tailoring interventions. 
Trial Registration: Open Science Framework; https://osf.io/c2xqd ", doi="10.2196/68960", url="https://ai.jmir.org/2025/1/e68960" } @Article{info:doi/10.2196/66491, author="Park, Soo Ji and Park, Sa-Yoon and Moon, Won Jae and Kim, Kwangsoo and Suh, In Dong", title="Artificial Intelligence Models for Pediatric Lung Sound Analysis: Systematic Review and Meta-Analysis", journal="J Med Internet Res", year="2025", month="Apr", day="18", volume="27", pages="e66491", keywords="machine learning", keywords="respiratory disease classification", keywords="wheeze detection", keywords="auscultation", keywords="mel-spectrogram", keywords="abnormal lung sound detection", keywords="artificial intelligence", keywords="pediatric", keywords="lung sound analysis", keywords="systematic review", keywords="asthma", keywords="pneumonia", keywords="children", keywords="morbidity", keywords="mortality", keywords="diagnostic", keywords="respiratory pathology", abstract="Background: Pediatric respiratory diseases, including asthma and pneumonia, are major causes of morbidity and mortality in children. Auscultation of lung sounds is a key diagnostic tool but is prone to subjective variability. The integration of artificial intelligence (AI) and machine learning (ML) with electronic stethoscopes offers a promising approach for automated and objective lung sound analysis. Objective: This systematic review and meta-analysis assesses the performance of ML models in pediatric lung sound analysis. The study evaluates the methodologies, model performance, and database characteristics while identifying limitations and future directions for clinical implementation. Methods: A systematic search was conducted in Medline via PubMed, Embase, Web of Science, OVID, and IEEE Xplore for studies published between January 1, 1990, and December 16, 2024. Inclusion criteria are as follows: studies developing ML models for pediatric lung sound classification with a defined database, physician-labeled reference standard, and reported performance metrics. Exclusion criteria are as follows: studies focusing on adults, cardiac auscultation, validation of existing models, or lacking performance metrics. Risk of bias was assessed using a modified Quality Assessment of Diagnostic Accuracy Studies (version 2) framework. Data were extracted on study design, dataset, ML methods, feature extraction, and classification tasks. Bivariate meta-analysis was performed for binary classification tasks, including wheezing and abnormal lung sound detection. Results: A total of 41 studies met the inclusion criteria. The most common classification task was binary detection of abnormal lung sounds, particularly wheezing. Pooled sensitivity and specificity for wheeze detection were 0.902 (95\% CI 0.726-0.970) and 0.955 (95\% CI 0.762-0.993), respectively. For abnormal lung sound detection, pooled sensitivity was 0.907 (95\% CI 0.816-0.956) and specificity 0.877 (95\% CI 0.813-0.921). The most frequently used feature extraction methods were Mel-spectrogram, Mel-frequency cepstral coefficients, and short-time Fourier transform. Convolutional neural networks were the predominant ML model, often combined with recurrent neural networks or residual network architectures. However, high heterogeneity in dataset size, annotation methods, and evaluation criteria was observed. Most studies relied on small, single-center datasets, limiting generalizability. 
Conclusions: ML models show high accuracy in pediatric lung sound analysis, but face limitations due to dataset heterogeneity, lack of standard guidelines, and limited external validation. Future research should focus on standardized protocols and the development of large-scale, multicenter datasets to improve model robustness and clinical implementation. ", doi="10.2196/66491", url="https://www.jmir.org/2025/1/e66491" } @Article{info:doi/10.2196/63130, author="Ferr{\'e}, Fabrice and Allassonni{\`e}re, St{\'e}phanie and Chadebec, Cl{\'e}ment and Minville, Vincent", title="Generating Artificial Patients With Reliable Clinical Characteristics Using a Geometry-Based Variational Autoencoder: Proof-of-Concept Feasibility Study", journal="J Med Internet Res", year="2025", month="Apr", day="17", volume="27", pages="e63130", keywords="digital health", keywords="artificial data", keywords="variational autoencoder", keywords="data science", keywords="artificial intelligence", keywords="health monitoring", keywords="deep learning", keywords="medical imaging", keywords="imaging", keywords="magnetic resonance imaging", keywords="Alzheimer disease", keywords="anesthesia", keywords="prediction", keywords="data augmentation", abstract="Background: Artificial patient technology could transform health care by accelerating diagnosis, treatment, and mapping clinical pathways. Deep learning methods for generating artificial data in health care include data augmentation by variational autoencoders (VAE) technology. Objective: We aimed to test the feasibility of generating artificial patients with reliable clinical characteristics by using a geometry-based VAE applied, for the first time, on high-dimension, low-sample-size tabular data. Methods: Clinical tabular data were extracted from 521 real patients of the ``MAX'' digital conversational agent (BOTdesign) created for preparing patients for anesthesia. A 3-stage methodological approach was implemented to generate up to 10,000 artificial patients: training the model and generating artificial data, assessing the consistency and confidentiality of artificial data, and validating the plausibility of the newly created artificial patients. Results: We demonstrated the feasibility of applying the VAE technique to tabular data to generate large artificial patient cohorts with high consistency (fidelity scores>94\%). Moreover, artificial patients could not be matched with real patients (filter similarity scores>99\%, $\kappa$ coefficients of agreement<0.2), thus guaranteeing the essential ethical concern of confidentiality. Conclusions: This proof-of-concept study has demonstrated our ability to augment real tabular data to generate artificial patients. These promising results make it possible to envisage in silico trials carried out on large cohorts of artificial patients, thereby overcoming the pitfalls usually encountered in in vivo trials. Further studies integrating longitudinal dynamics are needed to map patient trajectories. 
", doi="10.2196/63130", url="https://www.jmir.org/2025/1/e63130" } @Article{info:doi/10.2196/59076, author="Maharjan, Julina and Zhu, Jianfeng and King, Jennifer and Phan, NhatHai and Kenne, Deric and Jin, Ruoming", title="Large-Scale Deep Learning--Enabled Infodemiological Analysis of Substance Use Patterns on Social Media: Insights From the COVID-19 Pandemic", journal="JMIR Infodemiology", year="2025", month="Apr", day="17", volume="5", pages="e59076", keywords="substance use", keywords="social media", keywords="deep learning", keywords="Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach", keywords="human-in-the-loop", keywords="COVID-19", abstract="Background: The COVID-19 pandemic intensified the challenges associated with mental health and substance use (SU), with societal and economic upheavals leading to heightened stress and increased reliance on drugs as a coping mechanism. Centers for Disease Control and Prevention data from June 2020 showed that 13\% of Americans used substances more frequently due to pandemic-related stress, accompanied by an 18\% rise in drug overdoses early in the year. Simultaneously, a significant increase in social media engagement provided unique insights into these trends. Our study analyzed social media data from January 2019 to December 2021 to identify changes in SU patterns across the pandemic timeline, aiming to inform effective public health interventions. Objective: This study aims to analyze SU from large-scale social media data during the COVID-19 pandemic, including the prepandemic and postpandemic periods as baseline and consequence periods. The objective was to examine the patterns related to a broader spectrum of drug types with underlying themes, aiming to provide a more comprehensive understanding of SU trends during the COVID-19 pandemic. Methods: We leveraged a deep learning model, Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach (RoBERTa), to analyze 1.13 billion Twitter (subsequently rebranded X) posts from January 2019 to December 2021, aiming to identify SU posts. The model's performance was enhanced by a human-in-the-loop strategy that subsequently enriched the annotated data used during the fine-tuning phase. To gain insights into SU trends over the study period, we applied a range of statistical techniques, including trend analysis, k-means clustering, topic modeling, and thematic analysis. In addition, we integrated the system into a real-time application designed for monitoring and preventing SU within specific geographic locations. Results: Our research identified 9 million SU posts in the studied period. Compared to 2019 and 2021, the most substantial display of SU-related posts occurred in 2020, with a sharp 21\% increase within 3 days of the global COVID-19 pandemic declaration. Alcohol and cannabinoids remained the most discussed substances throughout the research period. The pandemic particularly influenced the rise in nonillicit substances, such as alcohol, prescription medication, and cannabinoids. In addition, thematic analysis highlighted COVID-19, mental health, and economic stress as the leading issues that contributed to the influx of substance-related posts during the study period. Conclusions: This study demonstrates the potential of leveraging social media data for real-time detection of SU trends during global crises. 
By uncovering how factors such as mental health and economic stress drive SU spikes, particularly in alcohol and prescription medication, we offer crucial insights for public health strategies. Our approach paves the way for proactive, data-driven interventions that will help mitigate the impact of future crises on vulnerable populations. ", doi="10.2196/59076", url="https://infodemiology.jmir.org/2025/1/e59076", url="http://www.ncbi.nlm.nih.gov/pubmed/40244656" } @Article{info:doi/10.2196/67248, author="Carrillo, Beltran and Rubinos-Cuadrado, Marta and Parellada-Martin, Jazmin and Palacios-L{\'o}pez, Alejandra and Carrillo-Rubinos, Beltran and Canillas-Del Rey, Fernando and Bazt{\'a}n-Cortes, Jose Juan and G{\'o}mez-Pavon, Javier", title="Effectiveness of The Umbrella Collaboration Versus Traditional Umbrella Reviews for Evidence Synthesis in Health Care: Protocol for a Validation Study", journal="JMIR Res Protoc", year="2025", month="Apr", day="14", volume="14", pages="e67248", keywords="tertiary evidence synthesis", keywords="The Umbrella Collaboration", keywords="umbrella reviews", keywords="health research methodology", keywords="AI-assisted synthesis", keywords="AI-assisted", keywords="evidence-based decision making", keywords="machine learning", keywords="ML", keywords="artificial intelligence", keywords="AI", keywords="algorithms", keywords="models", keywords="analytics", keywords="digital health", keywords="digital technology", keywords="digital interventions", abstract="Background: The synthesis of evidence in health care is essential for informed decision-making and policy development. This study aims to validate The Umbrella Collaboration (TU), an innovative, semiautomatic tertiary evidence synthesis methodology, by comparing it with Traditional Umbrella Reviews (TURs), which are currently the gold standard. Objective: This study aimed to evaluate whether TU, an artificial intelligence---assisted, software-driven system for tertiary evidence synthesis, can achieve comparable effectiveness to TURs, while offering a more timely, efficient, and comprehensive approach. In addition, as a secondary objective, the study aims to assess the accessibility and comprehensibility of TU's outputs to ensure its usability and practical applicability for health care professionals. Methods: This protocol outlines a comparative study divided into 2 main parts. The first part involves a quantitative comparison of results obtained using TU and TURs in geriatrics. We will evaluate the identification, effect size, direction, statistical significance, and certainty of outcomes, as well as the time and resources required for each methodology. Data for TURs will be sourced from Medline (via PubMed), while TU will use artificial intelligence---assisted informatics to replicate the research questions of the selected TURs. The second part of the study assesses the ease of use and comprehension of TU through an online survey directed at health professionals, using interactive features and detailed data access. Results: Expected results include the assessment of concordance in identifying outcomes, the effect size, direction, and significance of these outcomes, and the certainty of evidence. In addition, we will measure the operational efficiency of each methodology by evaluating the time taken to complete projects. User perceptions of the ease of use and comprehension of TU will be gathered through detailed surveys. The implementation of new methodologies in evidence synthesis requires validation. 
This study will determine whether TU can match the accuracy and comprehensiveness of TURs while offering benefits in terms of efficiency and user accessibility. The comparative study is designed to address the inherent challenges in validating a new methodology against established standards. Conclusions: If TU proves as effective as TURs but more time-efficient, accessible, and easily updatable, it could significantly enhance the process of evidence synthesis, facilitating informed decision-making and improving health care. This study represents a step toward integrating innovative technologies into routine evidence synthesis practice, potentially transforming health research. International Registered Report Identifier (IRRID): PRR1-10.2196/67248 ", doi="10.2196/67248", url="https://www.researchprotocols.org/2025/1/e67248", url="http://www.ncbi.nlm.nih.gov/pubmed/40057944" } @Article{info:doi/10.2196/67772, author="Su, Zhengyuan and Jiang, Huadong and Yang, Ying and Hou, Xiangqing and Su, Yanli and Yang, Li", title="Acoustic Features for Identifying Suicide Risk in Crisis Hotline Callers: Machine Learning Approach", journal="J Med Internet Res", year="2025", month="Apr", day="14", volume="27", pages="e67772", keywords="suicide", keywords="crisis hotline", keywords="acoustic feature", keywords="machine learning", keywords="acoustics", keywords="suicide risk", keywords="artificial intelligence", keywords="feasibility", keywords="prediction models", keywords="hotline callers", keywords="voice", abstract="Background: Crisis hotlines serve as a crucial avenue for the early identification of suicide risk, which is of paramount importance for suicide prevention and intervention. However, assessing the risk of callers in the crisis hotline context is constrained by factors such as lack of nonverbal communication cues, anonymity, time limits, and single-occasion intervention. Therefore, it is necessary to develop approaches, including acoustic features, for identifying the suicide risk among hotline callers early and quickly. Given the complicated features of sound, adopting artificial intelligence models to analyze callers' acoustic features is promising. Objective: In this study, we investigated the feasibility of using acoustic features to predict suicide risk in crisis hotline callers. We also adopted a machine learning approach to analyze the complex acoustic features of hotline callers, with the aim of developing suicide risk prediction models. Methods: We collected 525 suicide-related calls from the records of a psychological assistance hotline in a province in northwest China. Callers were categorized as low or high risk based on suicidal ideation, suicidal plans, and history of suicide attempts, with risk assessments verified by a team of 18 clinical psychology raters. A total of 164 clearly categorized risk recordings were analyzed, including 102 low-risk and 62 high-risk calls. We extracted 273 audio segments, each exceeding 2 seconds in duration, which were labeled by raters as containing suicide-related expressions for subsequent model training and evaluation. Basic acoustic features (eg, Mel Frequency Cepstral Coefficients, formant frequencies, jitter, shimmer) and high-level statistical function (HSF) features (using OpenSMILE [Open-Source Speech and Music Interpretation by Large-Space Extraction] with the ComParE 2016 configuration) were extracted. 
Four supervised machine learning algorithms (logistic regression, support vector machine, random forest, and extreme gradient boosting) were trained and evaluated using grouped 5-fold cross-validation and a test set, with performance metrics, including accuracy, F1-score, recall, and false negative rate. Results: The development of machine learning models utilizing HSF acoustic features has been demonstrated to enhance recognition performance compared to models based solely on basic acoustic features. The random forest classifier, developed with HSFs, achieved the best performance in detecting the suicide risk among the models evaluated (accuracy=0.75, F1-score=0.70, recall=0.76, false negative rate=0.24). Conclusions: The results of our study demonstrate the potential of developing artificial intelligence--based early warning systems using acoustic features for identifying the suicide risk among crisis hotline callers. Our work also has implications for employing acoustic features to identify suicide risk in salient voice contexts. ", doi="10.2196/67772", url="https://www.jmir.org/2025/1/e67772" } @Article{info:doi/10.2196/67144, author="Rahman, Mahmudur and Gao, Jifan and Carey, A. Kyle and Edelson, P. Dana and Afshar, Askar and Garrett, W. John and Chen, Guanhua and Afshar, Majid and Churpek, M. Matthew", title="Comparison of Deep Learning Approaches Using Chest Radiographs for Predicting Clinical Deterioration: Retrospective Observational Study", journal="JMIR AI", year="2025", month="Apr", day="10", volume="4", pages="e67144", keywords="chest X-ray", keywords="critical care", keywords="deep learning", keywords="chest radiographs", keywords="radiographs", keywords="clinical deterioration", keywords="prediction", keywords="predictive", keywords="deterioration", keywords="retrospective", keywords="data", keywords="dataset", keywords="artificial intelligence", keywords="AI", keywords="chest", keywords="patient", keywords="hospitalized", abstract="Background: The early detection of clinical deterioration and timely intervention for hospitalized patients can improve patient outcomes. The currently existing early warning systems rely on variables from structured data, such as vital signs and laboratory values, and do not incorporate other potentially predictive data modalities. Because respiratory failure is a common cause of deterioration, chest radiographs are often acquired in patients with clinical deterioration, which may be informative for predicting their risk of intensive care unit (ICU) transfer. Objective: This study aimed to compare and validate different computer vision models and data augmentation approaches with chest radiographs for predicting clinical deterioration. Methods: This retrospective observational study included adult patients hospitalized at the University of Wisconsin Health System between 2009 and 2020 with an elevated electronic cardiac arrest risk triage (eCART) score, a validated clinical deterioration early warning score, on the medical-surgical wards. Patients with a chest radiograph obtained within 48 hours prior to the elevated score were included in this study. 
Five computer vision model architectures (VGG16, DenseNet121, Vision Transformer, ResNet50, and Inception V3) and four data augmentation methods (histogram normalization, random flip, random Gaussian noise, and random rotate) were compared using the area under the receiver operating characteristic curve (AUROC) and the area under the precision-recall curve (AUPRC) for predicting clinical deterioration (ie, ICU transfer or ward death in the following 24 hours). Results: The study included 21,817 patient admissions, of which 1655 (7.6\%) experienced clinical deterioration. The DenseNet121 model pretrained on chest radiograph datasets with histogram normalization and random Gaussian noise augmentation had the highest discrimination (AUROC 0.734 and AUPRC 0.414), while the vision transformer having 24 transformer blocks with random rotate augmentation had the lowest discrimination (AUROC 0.598). Conclusions: The study shows the potential of chest radiographs in deep learning models for predicting clinical deterioration. The DenseNet121 architecture pretrained with chest radiographs performed better than other architectures in most experiments, and the addition of histogram normalization with random Gaussian noise data augmentation may enhance the performance of DenseNet121 and pretrained VGG16 architectures. ", doi="10.2196/67144", url="https://ai.jmir.org/2025/1/e67144" } @Article{info:doi/10.2196/59632, author="Hwang, Misun and Zheng, Yaguang and Cho, Youmin and Jiang, Yun", title="AI Applications for Chronic Condition Self-Management: Scoping Review", journal="J Med Internet Res", year="2025", month="Apr", day="8", volume="27", pages="e59632", keywords="artificial intelligence", keywords="chronic disease", keywords="self-management", keywords="generative AI", keywords="emotional self-management", abstract="Background: Artificial intelligence (AI) has potential in promoting and supporting self-management in patients with chronic conditions. However, the development and application of current AI technologies to meet patients' needs and improve their performance in chronic condition self-management tasks remain poorly understood. It is crucial to gather comprehensive information to guide the development and selection of effective AI solutions tailored for self-management in patients with chronic conditions. Objective: This scoping review aimed to provide a comprehensive overview of AI applications for chronic condition self-management based on 3 essential self-management tasks, medical, behavioral, and emotional self-management, and to identify the current developmental stages and knowledge gaps of AI applications for chronic condition self-management. Methods: A literature review was conducted for studies published in English between January 2011 and October 2024. In total, 4 databases, including PubMed, Web of Science, CINAHL, and PsycINFO, were searched using combined terms related to self-management and AI. The inclusion criteria included studies focused on the adult population with any type of chronic condition and AI technologies supporting self-management. This review was conducted following the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses Extension for Scoping Reviews) guidelines. Results: Of the 1873 articles retrieved from the search, 66 (3.5\%) were eligible and included in this review. The most studied chronic condition was diabetes (20/66, 30\%). 
Regarding self-management tasks, most studies aimed to support medical (45/66, 68\%) or behavioral self-management (27/66, 41\%), and fewer studies focused on emotional self-management (14/66, 21\%). Conversational AI (21/66, 32\%) and multiple machine learning algorithms (16/66, 24\%) were the most used AI technologies. However, most AI technologies remained in the algorithm development (25/66, 38\%) or early feasibility testing stages (25/66, 38\%). Conclusions: A variety of AI technologies have been developed and applied in chronic condition self-management, primarily for medication, symptoms, and lifestyle self-management. Fewer AI technologies were developed for emotional self-management tasks, and most AIs remained in the early developmental stages. More research is needed to generate evidence for integrating AI into chronic condition self-management to obtain optimal health outcomes. ", doi="10.2196/59632", url="https://www.jmir.org/2025/1/e59632" } @Article{info:doi/10.2196/57421, author="Perakslis, Eric and Nolen, Kimberly and Fricklas, Ethan and Tubb, Tracy", title="Striking a Balance: Innovation, Equity, and Consistency in AI Health Technologies", journal="JMIR AI", year="2025", month="Apr", day="7", volume="4", pages="e57421", keywords="artificial intelligence", keywords="algorithm", keywords="regulatory landscape", keywords="predictive model", keywords="predictive analytics", keywords="predictive system", keywords="practical model", keywords="machine learning", keywords="large language model", keywords="natural language processing", keywords="deep learning", keywords="digital health", keywords="regulatory", keywords="health technology", doi="10.2196/57421", url="https://ai.jmir.org/2025/1/e57421" } @Article{info:doi/10.2196/64447, author="Castellanos, Arturo and Jiang, Haoqiang and Gomes, Paulo and Vander Meer, Debra and Castillo, Alfred", title="Large Language Models for Thematic Summarization in Qualitative Health Care Research: Comparative Analysis of Model and Human Performance", journal="JMIR AI", year="2025", month="Apr", day="4", volume="4", pages="e64447", keywords="artificial intelligence", keywords="generative AI", keywords="large language models", keywords="ChatGPT", keywords="machine learning", keywords="health care", abstract="Background: The application of large language models (LLMs) in analyzing expert textual online data is a topic of growing importance in computational linguistics and qualitative research within health care settings. Objective: The objective of this study was to understand how LLMs can help analyze expert textual data. Topic modeling enables scaling the thematic analysis of content of a large corpus of data, but it still requires interpretation. We investigate the use of LLMs to help researchers scale this interpretation. Methods: The primary methodological phases of this project were (1) collecting data representing posts to an online nurse forum, as well as cleaning and preprocessing the data; (2) using latent Dirichlet allocation (LDA) to derive topics; (3) using human categorization for topic modeling; and (4) using LLMs to complement and scale the interpretation of thematic analysis. The purpose is to compare the outcomes of human interpretation with those derived from LLMs. Results: There is substantial agreement (247/310, 80\%) between LLM and human interpretation. For two-thirds of the topics, human evaluation and LLMs agree on alignment and convergence of themes. 
Furthermore, LLM subthemes offer depth of analysis within LDA topics, providing detailed explanations that align with and build upon established human themes. Nonetheless, LLMs identify coherence and complementarity where human evaluation does not. Conclusions: LLMs enable the automation of the interpretation task in qualitative research. There are challenges in the use of LLMs for evaluation of the resulting themes. ", doi="10.2196/64447", url="https://ai.jmir.org/2025/1/e64447" } @Article{info:doi/10.2196/68809, author="Groene, Nicole and Nickel, Audrey and Rohn, E. Amanda", title="Insights on the Side Effects of Female Contraceptive Products From Online Drug Reviews: Natural Language Processing--Based Content Analysis", journal="JMIR AI", year="2025", month="Apr", day="3", volume="4", pages="e68809", keywords="contraception", keywords="side effects", keywords="natural language processing", keywords="NLP", keywords="informed choices", keywords="online reviews", keywords="women", keywords="well-being", abstract="Background: Most online and social media discussions about birth control methods for women center on side effects, highlighting a demand for shared experiences with these products. Online user reviews and ratings of birth control products offer a largely untapped supplementary resource that could assist women and their partners in making informed contraception choices. Objective: This study sought to analyze women's online ratings and reviews of various birth control methods, focusing on side effects linked to low product ratings. Methods: Using natural language processing (NLP) for topic modeling and descriptive statistics, this study analyzes 19,506 unique reviews of female contraceptive products posted on the website Drugs.com. Results: Ratings vary widely across contraception types. Hormonal contraceptives with high systemic absorption, such as progestin-only pills and extended-cycle pills, received more unfavorable reviews than other methods and women frequently described menstrual irregularities, continuous bleeding, and weight gain associated with their administration. Intrauterine devices were generally rated more positively, although about 1 in 10 users reported severe cramps and pain, which were linked to very poor ratings. Conclusions: While exploratory, this study highlights the potential of NLP in analyzing extensive online reviews to reveal insights into women's experiences with contraceptives and the impact of side effects on their overall well-being. In addition to results from clinical studies, NLP-derived insights from online reviews can provide complementary information for women and health care providers, despite possible biases in online reviews. The findings suggest a need for further research to validate links between specific side effects, contraceptive methods, and women's overall well-being. 
", doi="10.2196/68809", url="https://ai.jmir.org/2025/1/e68809" } @Article{info:doi/10.2196/57828, author="Dor{\'e}mus, Oc{\'e}ane and Russon, Dylan and Contrand, Benjamin and Guerra-Adames, Ariel and Avalos-Fernandez, Marta and Gil-Jardin{\'e}, C{\'e}dric and Lagarde, Emmanuel", title="Harnessing Moderate-Sized Language Models for Reliable Patient Data Deidentification in Emergency Department Records: Algorithm Development, Validation, and Implementation Study", journal="JMIR AI", year="2025", month="Apr", day="1", volume="4", pages="e57828", keywords="de-identification", keywords="machine learning", keywords="large language model", keywords="natural language processing", keywords="electronic health records", keywords="transformers", keywords="general data protection regulation", keywords="clinical notes", abstract="Background: The digitization of health care, facilitated by the adoption of electronic health records systems, has revolutionized data-driven medical research and patient care. While this digital transformation offers substantial benefits in health care efficiency and accessibility, it concurrently raises significant concerns over privacy and data security. Initially, the journey toward protecting patient data deidentification saw the transition from rule-based systems to more mixed approaches including machine learning for deidentifying patient data. Subsequently, the emergence of large language models has represented a further opportunity in this domain, offering unparalleled potential for enhancing the accuracy of context-sensitive deidentification. However, despite large language models offering significant potential, the deployment of the most advanced models in hospital environments is frequently hindered by data security issues and the extensive hardware resources required. Objective: The objective of our study is to design, implement, and evaluate deidentification algorithms using fine-tuned moderate-sized open-source language models, ensuring their suitability for production inference tasks on personal computers. Methods: We aimed to replace personal identifying information (PII) with generic placeholders or labeling non-PII texts as ``ANONYMOUS,'' ensuring privacy while preserving textual integrity. Our dataset, derived from over 425,000 clinical notes from the adult emergency department of the Bordeaux University Hospital in France, underwent independent double annotation by 2 experts to create a reference for model validation with 3000 clinical notes randomly selected. Three open-source language models of manageable size were selected for their feasibility in hospital settings: Llama 2 (Meta) 7B, Mistral 7B, and Mixtral 8{\texttimes}7B (Mistral AI). Fine-tuning used the quantized low-rank adaptation technique. Evaluation focused on PII-level (recall, precision, and F1-score) and clinical note-level metrics (recall and BLEU [bilingual evaluation understudy] metric), assessing deidentification effectiveness and content preservation. Results: The generative model Mistral 7B performed the highest with an overall F1-score of 0.9673 (vs 0.8750 for Llama 2 and 0.8686 for Mixtral 8{\texttimes}7B). At the clinical notes level, the model's overall recall was 0.9326 (vs 0.6888 for Llama 2 and 0.6417 for Mixtral 8{\texttimes}7B). This rate increased to 0.9915 when Mistral 7B only deleted names. Four notes of 3000 failed to be fully pseudonymized for names: in 1 case, the nondeleted name belonged to a patient, while in the others, it belonged to medical staff. 
Beyond the fifth epoch, the BLEU score consistently exceeded 0.9864, indicating no significant text alteration. Conclusions: Our research underscores the significant capabilities of generative natural language processing models, with Mistral 7B standing out for its superior ability to deidentify clinical texts efficiently. Achieving notable performance metrics, Mistral 7B operates effectively without requiring high-end computational resources. These methods pave the way for a broader availability of pseudonymized clinical texts, enabling their use for research purposes and the optimization of the health care system. ", doi="10.2196/57828", url="https://ai.jmir.org/2025/1/e57828" } @Article{info:doi/10.2196/62985, author="Loh, Rong De and Hill, D. Elliot and Liu, Nan and Dawson, Geraldine and Engelhard, M. Matthew", title="Limitations of Binary Classification for Long-Horizon Diagnosis Prediction and Advantages of a Discrete-Time Time-to-Event Approach: Empirical Analysis", journal="JMIR AI", year="2025", month="Mar", day="27", volume="4", pages="e62985", keywords="machine learning", keywords="artificial intelligence", keywords="deep learning", keywords="predictive models", keywords="practical models", keywords="early detection", keywords="electronic health records", keywords="right-censoring", keywords="survival analysis", keywords="distributional shifts", abstract="Background: A major challenge in using electronic health records (EHR) is the inconsistency of patient follow-up, resulting in right-censored outcomes. This becomes particularly problematic in long-horizon event predictions, such as autism and attention-deficit/hyperactivity disorder (ADHD) diagnoses, where a significant number of patients are lost to follow-up before the outcome can be observed. Consequently, fully supervised methods such as binary classification (BC), which are trained to predict observed diagnoses, are substantially affected by the probability of sufficient follow-up, leading to biased results. Objective: This empirical analysis aims to characterize BC's inherent limitations for long-horizon diagnosis prediction from EHR and to quantify the benefits of a specific time-to-event (TTE) approach, the discrete-time neural network (DTNN). Methods: Records within the Duke University Health System EHR were analyzed, extracting features such as ICD-10 (International Classification of Diseases, Tenth Revision) diagnosis codes, medications, laboratories, and procedures. We compared a DTNN to 3 BC approaches and a deep Cox proportional hazards model across 4 clinical conditions to examine distributional patterns across various subgroups. Time-varying area under the receiver operating characteristic curve (AUCt) and time-varying average precision (APt) were our primary evaluation metrics. Results: TTE models consistently had comparable or higher AUCt and APt than BC for all conditions. 
At clinically relevant operating time points, the area under the receiver operating characteristic curve (AUC) values for DTNN$_{YOB\leq2020}$ (YOB: year of birth) and DCPH$_{YOB\leq2020}$ (deep Cox proportional hazards) were 0.70 (95\% CI 0.66-0.77) and 0.72 (95\% CI 0.66-0.78) at t=5 for autism, 0.72 (95\% CI 0.65-0.76) and 0.68 (95\% CI 0.62-0.74) at t=7 for ADHD, 0.72 (95\% CI 0.70-0.75) and 0.71 (95\% CI 0.69-0.74) at t=1 for recurrent otitis media, and 0.74 (95\% CI 0.68-0.82) and 0.71 (95\% CI 0.63-0.77) at t=1 for food allergy, compared to 0.60 (95\% CI 0.55-0.66), 0.47 (95\% CI 0.40-0.54), 0.73 (95\% CI 0.70-0.75), and 0.77 (95\% CI 0.71-0.82) for BC$_{YOB\leq2020}$, respectively. The probabilities predicted by BC models were positively correlated with censoring times, particularly for autism and ADHD prediction. Filtering strategies based on YOB or length of follow-up only partially corrected these biases. In subgroup analyses, only DTNN predicted diagnosis probabilities that accurately reflect actual clinical prevalence and temporal trends. Conclusions: BC models substantially underpredicted diagnosis likelihood and inappropriately assigned lower probability scores to individuals with earlier censoring. Common filtering strategies did not adequately address this limitation. TTE approaches, particularly DTNN, effectively mitigated bias from the censoring distribution, resulting in superior discrimination and calibration performance and more accurate prediction of clinical prevalence. Machine learning practitioners should recognize the limitations of BC for long-horizon diagnosis prediction and adopt TTE approaches. The DTNN in particular is well-suited to mitigate the effects of right-censoring and maximize prediction performance in this setting. ", doi="10.2196/62985", url="https://ai.jmir.org/2025/1/e62985" } @Article{info:doi/10.2196/67363, author="Roshani, Amin Mohammad and Zhou, Xiangyu and Qiang, Yao and Suresh, Srinivasan and Hicks, Steven and Sethuraman, Usha and Zhu, Dongxiao", title="Generative Large Language Model---Powered Conversational AI App for Personalized Risk Assessment: Case Study in COVID-19", journal="JMIR AI", year="2025", month="Mar", day="27", volume="4", pages="e67363", keywords="personalized risk assessment", keywords="large language model", keywords="conversational AI", keywords="artificial intelligence", keywords="COVID-19", abstract="Background: Large language models (LLMs) have demonstrated powerful capabilities in natural language tasks and are increasingly being integrated into health care for tasks like disease risk assessment. Traditional machine learning methods rely on structured data and coding, limiting their flexibility in dynamic clinical environments. This study presents a novel approach to disease risk assessment using generative LLMs through conversational artificial intelligence (AI), eliminating the need for programming. Objective: This study evaluates the use of pretrained generative LLMs, including LLaMA2-7b and Flan-T5-xl, for COVID-19 severity prediction with the goal of enabling a real-time, no-code, risk assessment solution through chatbot-based, question-answering interactions. To contextualize their performance, we compare LLMs with traditional machine learning classifiers, such as logistic regression, extreme gradient boosting (XGBoost), and random forest, which rely on tabular data. 
Methods: We fine-tuned LLMs using few-shot natural language examples from a dataset of 393 pediatric patients, developing a mobile app that integrates these models to provide real-time, no-code, COVID-19 severity risk assessment through clinician-patient interaction. The LLMs were compared with traditional classifiers across different experimental settings, using the area under the curve (AUC) as the primary evaluation metric. Feature importance derived from LLM attention layers was also analyzed to enhance interpretability. Results: Generative LLMs demonstrated strong performance in low-data settings. In zero-shot scenarios, the T0-3b-T model achieved an AUC of 0.75, while other LLMs, such as T0pp(8bit)-T and Flan-T5-xl-T, reached 0.67 and 0.69, respectively. At 2-shot settings, logistic regression and random forest achieved an AUC of 0.57, while Flan-T5-xl-T and T0-3b-T obtained 0.69 and 0.65, respectively. By 32-shot settings, Flan-T5-xl-T reached 0.70, similar to logistic regression (0.69) and random forest (0.68), while XGBoost improved to 0.65. These results illustrate the differences in how generative LLMs and traditional models handle the increasing data availability. LLMs perform well in low-data scenarios, whereas traditional models rely more on structured tabular data and labeled training examples. Furthermore, the mobile app provides real-time, COVID-19 severity assessments and personalized insights through attention-based feature importance, adding value to the clinical interpretation of the results. Conclusions: Generative LLMs provide a robust alternative to traditional classifiers, particularly in scenarios with limited labeled data. Their ability to handle unstructured inputs and deliver personalized, real-time assessments without coding makes them highly adaptable to clinical settings. This study underscores the potential of LLM-powered conversational artificial intelligence (AI) in health care and encourages further exploration of its use for real-time, disease risk assessment and decision-making support. ", doi="10.2196/67363", url="https://ai.jmir.org/2025/1/e67363" } @Article{info:doi/10.2196/69820, author="Waaler, Niklas Per and Hussain, Musarrat and Molchanov, Igor and Bongo, Ailo Lars and Elvev{\aa}g, Brita", title="Prompt Engineering an Informational Chatbot for Education on Mental Health Using a Multiagent Approach for Enhanced Compliance With Prompt Instructions: Algorithm Development and Validation", journal="JMIR AI", year="2025", month="Mar", day="26", volume="4", pages="e69820", keywords="schizophrenia", keywords="mental health", keywords="prompt engineering", keywords="AI in health care", keywords="AI safety", keywords="self-reflection", keywords="limiting scope of AI", keywords="large language model", keywords="LLM", keywords="GPT-4", keywords="AI transparency", keywords="adaptive learning", abstract="Background: People with schizophrenia often present with cognitive impairments that may hinder their ability to learn about their condition. Education platforms powered by large language models (LLMs) have the potential to improve the accessibility of mental health information. However, the black-box nature of LLMs raises ethical and safety concerns regarding the controllability of chatbots. In particular, prompt-engineered chatbots may drift from their intended role as the conversation progresses and become more prone to hallucinations. 
Objective: This study aimed to develop and evaluate a critical analysis filter (CAF) system that ensures that an LLM-powered prompt-engineered chatbot reliably complies with its predefined instructions and scope while delivering validated mental health information. Methods: For a proof of concept, we prompt-engineered an educational chatbot for schizophrenia powered by GPT-4 that could dynamically access information from a schizophrenia manual written for people with schizophrenia and their caregivers. In the CAF, a team of prompt-engineered LLM agents was used to critically analyze and refine the chatbot's responses and deliver real-time feedback to the chatbot. To assess the ability of the CAF to re-establish the chatbot's adherence to its instructions, we generated 3 conversations (by conversing with the chatbot with the CAF disabled) wherein the chatbot started to drift from its instructions toward various unintended roles. We used these checkpoint conversations to initialize automated conversations between the chatbot and adversarial chatbots designed to entice it toward unintended roles. Conversations were repeatedly sampled with the CAF enabled and disabled. In total, 3 human raters independently rated each chatbot response according to criteria developed to measure the chatbot's integrity, specifically, its transparency (such as admitting when a statement lacked explicit support from its scripted sources) and its tendency to faithfully convey the scripted information in the schizophrenia manual. Results: In total, 36 responses (3 different checkpoint conversations, 3 conversations per checkpoint, and 4 adversarial queries per conversation) were rated for compliance with the CAF enabled and disabled. Activating the CAF resulted in a compliance score that was considered acceptable ($\geq$2) in 81\% (7/36) of the responses, compared to only 8.3\% (3/36) when the CAF was deactivated. Conclusions: Although more rigorous testing in realistic scenarios is needed, our results suggest that self-reflection mechanisms could enable LLMs to be used effectively and safely in educational mental health platforms. This approach harnesses the flexibility of LLMs while reliably constraining their scope to appropriate and accurate interactions. ", doi="10.2196/69820", url="https://ai.jmir.org/2025/1/e69820", url="http://www.ncbi.nlm.nih.gov/pubmed/39992720" } @Article{info:doi/10.2196/59094, author="Dawadi, Research and Inoue, Mai and Tay, Ting Jie and Martin-Morales, Agustin and Vu, Thien and Araki, Michihiro", title="Disease Prediction Using Machine Learning on Smartphone-Based Eye, Skin, and Voice Data: Scoping Review", journal="JMIR AI", year="2025", month="Mar", day="25", volume="4", pages="e59094", keywords="literature review", keywords="machine learning", keywords="smartphone", keywords="health diagnosis", abstract="Background: The application of machine learning methods to data generated by ubiquitous devices like smartphones presents an opportunity to enhance the quality of health care and diagnostics. Smartphones are ideal for gathering data easily, providing quick feedback on diagnoses, and proposing interventions for health improvement. Objective: We reviewed the existing literature to gather studies that have used machine learning models with smartphone-derived data for the prediction and diagnosis of health anomalies. 
We divided the studies into those that used machine learning models by conducting experiments to retrieve data and predict diseases, and those that used machine learning models on publicly available databases. The details of databases, experiments, and machine learning models are intended to help researchers working in the fields of machine learning and artificial intelligence in the health care domain. Researchers can use the information to design their experiments or determine the databases they could analyze. Methods: A comprehensive search of the PubMed and IEEE Xplore databases was conducted, and an in-house keyword screening method was used to filter the articles based on the content of their titles and abstracts. Subsequently, studies related to the 3 areas of voice, skin, and eye were selected and analyzed based on how data for machine learning models were extracted (ie, the use of publicly available databases or through experiments). The machine learning methods used in each study were also noted. Results: A total of 49 studies were identified as being relevant to the topic of interest, and among these studies, there were 31 different databases and 24 different machine learning methods. Conclusions: The results provide a better understanding of how smartphone data are collected for predicting different diseases and what kinds of machine learning methods are used on these data. Similarly, publicly available databases having smartphone-based data that can be used for the diagnosis of various diseases have been presented. Our screening method could be used or improved in future studies, and our findings could be used as a reference to conduct similar studies, experiments, or statistical analyses. ", doi="10.2196/59094", url="https://ai.jmir.org/2025/1/e59094" } @Article{info:doi/10.2196/66273, author="Gyrard, Amelie and Abedian, Somayeh and Gribbon, Philip and Manias, George and van Nuland, Rick and Zatloukal, Kurt and Nicolae, Emilia Irina and Danciu, Gabriel and Nechifor, Septimiu and Marti-Bonmati, Luis and Mallol, Pedro and Dalmiani, Stefano and Autexier, Serge and Jendrossek, Mario and Avramidis, Ioannis and Garcia Alvarez, Eva and Holub, Petr and Blanquer, Ignacio and Boden, Anna and Hussein, Rada", title="Lessons Learned From European Health Data Projects With Cancer Use Cases: Implementation of Health Standards and Internet of Things Semantic Interoperability", journal="J Med Internet Res", year="2025", month="Mar", day="24", volume="27", pages="e66273", keywords="artificial intelligence", keywords="cancer", keywords="European Health Data Space", keywords="health care standards", keywords="interoperability", keywords="AI", keywords="health data", keywords="cancer use cases", keywords="IoT", keywords="Internet of Things", keywords="primary data", keywords="diagnosis", keywords="prognosis", keywords="decision-making", doi="10.2196/66273", url="https://www.jmir.org/2025/1/e66273", url="http://www.ncbi.nlm.nih.gov/pubmed/40126534" } @Article{info:doi/10.2196/58375, author="Madrid, Julian and Diehl, Philipp and Selig, Mischa and Rolauffs, Bernd and Hans, Patricius Felix and Busch, Hans-J{\"o}rg and Scheef, Tobias and Benning, Leo", title="Performance of Plug-In Augmented ChatGPT and Its Ability to Quantify Uncertainty: Simulation Study on the German Medical Board Examination", journal="JMIR Med Educ", year="2025", month="Mar", day="21", volume="11", pages="e58375", keywords="medical education", keywords="artificial intelligence", keywords="generative AI", keywords="large language 
model", keywords="LLM", keywords="ChatGPT", keywords="GPT-4", keywords="board licensing examination", keywords="professional education", keywords="examination", keywords="student", keywords="experimental", keywords="bootstrapping", keywords="confidence interval", abstract="Background: The GPT-4 is a large language model (LLM) trained and fine-tuned on an extensive dataset. After the public release of its predecessor in November 2022, the use of LLMs has seen a significant spike in interest, and a multitude of potential use cases have been proposed. In parallel, however, important limitations have been outlined. Particularly, current LLMs encounter limitations, especially in symbolic representation and accessing contemporary data. The recent version of GPT-4, alongside newly released plugin features, has been introduced to mitigate some of these limitations. Objective: Before this background, this work aims to investigate the performance of GPT-3.5, GPT-4, GPT-4 with plugins, and GPT-4 with plugins using pretranslated English text on the German medical board examination. Recognizing the critical importance of quantifying uncertainty for LLM applications in medicine, we furthermore assess this ability and develop a new metric termed ``confidence accuracy'' to evaluate it. Methods: We used GPT-3.5, GPT-4, GPT-4 with plugins, and GPT-4 with plugins and translation to answer questions from the German medical board examination. Additionally, we conducted an analysis to assess how the models justify their answers, the accuracy of their responses, and the error structure of their answers. Bootstrapping and CIs were used to evaluate the statistical significance of our findings. Results: This study demonstrated that available GPT models, as LLM examples, exceeded the minimum competency threshold established by the German medical board for medical students to obtain board certification to practice medicine. Moreover, the models could assess the uncertainty in their responses, albeit exhibiting overconfidence. Additionally, this work unraveled certain justification and reasoning structures that emerge when GPT generates answers. Conclusions: The high performance of GPTs in answering medical questions positions it well for applications in academia and, potentially, clinical practice. Its capability to quantify uncertainty in answers suggests it could be a valuable artificial intelligence agent within the clinical decision-making loop. Nevertheless, significant challenges must be addressed before artificial intelligence agents can be robustly and safely implemented in the medical domain. 
", doi="10.2196/58375", url="https://mededu.jmir.org/2025/1/e58375" } @Article{info:doi/10.2196/58897, author="Tseng, Liang-Wei and Lu, Yi-Chin and Tseng, Liang-Chi and Chen, Yu-Chun and Chen, Hsing-Yu", title="Performance of ChatGPT-4 on Taiwanese Traditional Chinese Medicine Licensing Examinations: Cross-Sectional Study", journal="JMIR Med Educ", year="2025", month="Mar", day="19", volume="11", pages="e58897", keywords="artificial intelligence", keywords="AI language understanding tools", keywords="ChatGPT", keywords="natural language processing", keywords="machine learning", keywords="Chinese medicine license exam", keywords="Chinese medical licensing examination", keywords="medical education", keywords="traditional Chinese medicine", keywords="large language model", abstract="Background: The integration of artificial intelligence (AI), notably ChatGPT, into medical education, has shown promising results in various medical fields. Nevertheless, its efficacy in traditional Chinese medicine (TCM) examinations remains understudied. Objective: This study aims to (1) assess the performance of ChatGPT on the TCM licensing examination in Taiwan and (2) evaluate the model's explainability in answering TCM-related questions to determine its suitability as a TCM learning tool. Methods: We used the GPT-4 model to respond to 480 questions from the 2022 TCM licensing examination. This study compared the performance of the model against that of licensed TCM doctors using 2 approaches, namely direct answer selection and provision of explanations before answer selection. The accuracy and consistency of AI-generated responses were analyzed. Moreover, a breakdown of question characteristics was performed based on the cognitive level, depth of knowledge, types of questions, vignette style, and polarity of questions. Results: ChatGPT achieved an overall accuracy of 43.9\%, which was lower than that of 2 human participants (70\% and 78.4\%). The analysis did not reveal a significant correlation between the accuracy of the model and the characteristics of the questions. An in-depth examination indicated that errors predominantly resulted from a misunderstanding of TCM concepts (55.3\%), emphasizing the limitations of the model with regard to its TCM knowledge base and reasoning capability. Conclusions: Although ChatGPT shows promise as an educational tool, its current performance on TCM licensing examinations is lacking. This highlights the need for enhancing AI models with specialized TCM training and suggests a cautious approach to utilizing AI for TCM education. Future research should focus on model improvement and the development of tailored educational applications to support TCM learning. 
", doi="10.2196/58897", url="https://mededu.jmir.org/2025/1/e58897" } @Article{info:doi/10.2196/69150, author="Twumasi, Clement and Aktas, Mikail and Santoni, Nicholas", title="Kinetic Pattern Recognition in Home-Based Knee Rehabilitation Using Machine Learning Clustering Methods on the Slider Digital Physiotherapy Device: Prospective Observational Study", journal="JMIR Form Res", year="2025", month="Mar", day="18", volume="9", pages="e69150", keywords="machine learning", keywords="cluster analysis", keywords="force measurement", keywords="knee replacement", keywords="musculoskeletal", keywords="physical therapy", keywords="Slider device", keywords="knee osteoarthritis", keywords="digital health", keywords="telerehabilitation", abstract="Background: Recent advancements in rehabilitation sciences have progressively used computational techniques to improve diagnostic and treatment approaches. However, the analysis of high-dimensional, time-dependent data continues to pose a significant problem. Prior research has used clustering techniques on rehabilitation data to identify movement patterns and forecast recovery outcomes. Nonetheless, these initiatives have not yet used force or motion datasets obtained outside a clinical setting, thereby limiting the capacity for therapeutic decisions. Biomechanical data analysis has demonstrated considerable potential in bridging these gaps and improving clinical decision-making in rehabilitation settings. Objective: This study presents a comprehensive clustering analysis of multidimensional movement datasets captured using a novel home exercise device, the ``Slider''. The aim is to identify clinically relevant movement patterns and provide answers to open research questions for the first time to inform personalized rehabilitation protocols, predict individual recovery trajectories, and assess the risks of potential postoperative complications. Methods: High-dimensional, time-dependent, bilateral knee kinetic datasets were independently analyzed from 32 participants using four unsupervised clustering techniques: k-means, hierarchical clustering, partition around medoids, and CLARA (Clustering Large Applications). The data comprised force, laser-measured distance, and optical tracker coordinates from lower limb activities. The optimal clusters identified through the unsupervised clustering methods were further evaluated and compared using silhouette analysis to quantify their performance. Key determinants of cluster membership were assessed, including demographic factors (eg, gender, BMI, and age) and pain levels, by using a logistic regression model with analysis of covariance adjustment. Results: Three distinct, time-varying movement patterns or clusters were identified for each knee. Hierarchical clustering performed best for the right knee datasets (with an average silhouette score of 0.637), while CLARA was the most effective for the left knee datasets (with an average silhouette score of 0.598). Key predictors of the movement cluster membership were discovered for both knees. BMI was the most influential determinant of cluster membership for the right knee, where higher BMI decreased the odds of cluster-2 membership (odds ratio [OR] 0.95, 95\% CI 0.94-0.96; P<.001) but increased the odds for cluster-3 assignment relative to cluster 1 (OR 1.05, 95\% CI 1.03-1.06; P<.001). 
For the left knee, all predictors of cluster-2 membership were significant (.001≤P≤.008), whereas only BMI (P=.81) could not predict the likelihood of an individual belonging to cluster 3 compared to cluster 1. Gender was the strongest determinant for the left knee, with male participants significantly more likely to belong to cluster 3 (OR 3.52, 95\% CI 2.91-4.27; P<.001). Conclusions: These kinetic patterns offer significant insights for creating personalized rehabilitation procedures, potentially improving patient outcomes. These findings underscore the efficacy of unsupervised clustering techniques in the analysis of biomechanical data for clinical rehabilitation applications. ", doi="10.2196/69150", url="https://formative.jmir.org/2025/1/e69150" } @Article{info:doi/10.2196/55277, author="Lau, Jerry and Bisht, Shivani and Horton, Robert and Crisan, Annamaria and Jones, John and Gantotti, Sandeep and Hermes-DeSantis, Evelyn", title="Creation of Scientific Response Documents for Addressing Product Medical Information Inquiries: Mixed Method Approach Using Artificial Intelligence", journal="JMIR AI", year="2025", month="Mar", day="13", volume="4", pages="e55277", keywords="AI", keywords="LLM", keywords="GPT", keywords="biopharmaceutical", keywords="medical information", keywords="content generation", keywords="artificial intelligence", keywords="pharmaceutical", keywords="scientific response", keywords="documentation", keywords="information", keywords="clinical data", keywords="strategy", keywords="reference", keywords="feasibility", keywords="development", keywords="machine learning", keywords="large language model", keywords="accuracy", keywords="context", keywords="traceability", keywords="accountability", keywords="survey", keywords="scientific response documentation", keywords="SRD", keywords="benefit", keywords="content generator", keywords="content analysis", keywords="Generative Pre-trained Transformer", abstract="Background: Pharmaceutical manufacturers address health care professionals' information needs through scientific response documents (SRDs), offering evidence-based answers to medication and disease state questions. Medical information departments, staffed by medical experts, develop SRDs that provide concise summaries consisting of relevant background information, search strategies, clinical data, and balanced references. With an escalating demand for SRDs and the increasing complexity of therapies, medical information departments are exploring advanced technologies and artificial intelligence (AI) tools like large language models (LLMs) to streamline content development. While AI and LLMs show promise in generating draft responses, a synergistic approach combining an LLM with traditional machine learning classifiers in a series of human-supervised and -curated steps could help address limitations, including hallucinations. This will ensure accuracy, context, traceability, and accountability in the development of the concise clinical data summaries of an SRD. Objective: This study aims to quantify the challenges of SRD development and develop a framework exploring the feasibility and value addition of integrating AI capabilities in the process of creating concise summaries for an SRD. Methods: To measure the challenges in SRD development, a survey was conducted by phactMI, a nonprofit consortium of medical information leaders in the pharmaceutical industry, assessing aspects of SRD creation among its member companies. 
The survey collected data on the time and tediousness of various activities related to SRD development. Another working group, consisting of medical information professionals and data scientists, used AI to aid SRD authoring, focusing on data extraction and abstraction. They used logistic regression on semantic embedding features to train classification models and transformer-based summarization pipelines to generate concise summaries. Results: Of the 33 companies surveyed, 64\% (21/33) opened the survey, and 76\% (16/21) of those responded. On average, medical information departments generate 614 new documents and update 1352 documents each year. Respondents considered paraphrasing scientific articles to be the most tedious and time-intensive task. In the project's second phase, sentence classification models showed the ability to accurately distinguish target categories with receiver operating characteristic scores ranging from 0.67 to 0.85 (all P<.001), allowing for accurate data extraction. For data abstraction, the comparison of the bilingual evaluation understudy (BLEU) score and semantic similarity in the paraphrased texts yielded different results among reviewers, with each preferring different trade-offs between these metrics. Conclusions: This study establishes a framework for integrating LLM and machine learning into SRD development, supported by a pharmaceutical company survey emphasizing the challenges of paraphrasing content. While machine learning models show potential for section identification and content usability assessment in data extraction and abstraction, further optimization and research are essential before full-scale industry implementation. The working group's insights guide an AI-driven content analysis; address limitations; and advance efficient, precise, and responsive frameworks to assist with pharmaceutical SRD development. ", doi="10.2196/55277", url="https://ai.jmir.org/2025/1/e55277" } @Article{info:doi/10.2196/59295, author="Grosser, John and D{\"u}vel, Juliane and Hasemann, Lena and Schneider, Emilia and Greiner, Wolfgang", title="Studying the Potential Effects of Artificial Intelligence on Physician Autonomy: Scoping Review", journal="JMIR AI", year="2025", month="Mar", day="13", volume="4", pages="e59295", keywords="autonomy, professional autonomy", keywords="physician autonomy", keywords="ethics", keywords="artificial intelligence", keywords="clinical decision support systems", keywords="CDSS", keywords="ethics of artificial intelligence", keywords="AI ethics", keywords="AI", keywords="scoping review", keywords="physician", keywords="acceptance", keywords="adoption", abstract="Background: Physician autonomy has been found to play a role in physician acceptance and adoption of artificial intelligence (AI) in medicine. However, there is still no consensus in the literature on how to define and assess physician autonomy. Furthermore, there is a lack of research focusing specifically on the potential effects of AI on physician autonomy. Objective: This scoping review addresses the following research questions: (1) How do qualitative studies conceptualize and assess physician autonomy? (2) Which aspects of physician autonomy are addressed by these studies? (3) What are the potential benefits and harms of AI for physician autonomy identified by these studies? Methods: We performed a scoping review of qualitative studies on AI and physician autonomy published before November 6, 2023, by searching MEDLINE and Web of Science. 
To answer research question 1, we determined whether the included studies explicitly include physician autonomy as a research focus and whether their interview, survey, and focus group questions explicitly name or implicitly include aspects of physician autonomy. To answer research question 2, we extracted the qualitative results of the studies, categorizing them into the 7 components of physician autonomy introduced by Schulz and Harrison. We then inductively formed subcomponents based on the results of the included studies in each component. To answer research question 3, we summarized the potentially harmful and beneficial effects of AI on physician autonomy in each of the inductively formed subcomponents. Results: The search yielded 369 studies after duplicates were removed. Of these, 27 studies remained after titles and abstracts were screened. After full texts were screened, we included a total of 7 qualitative studies. Most studies did not explicitly name physician autonomy as a research focus or explicitly address physician autonomy in their interview, survey, and focus group questions. No studies addressed a complete set of components of physician autonomy; while 3 components were addressed by all included studies, 2 components were addressed by none. We identified a total of 11 subcomponents for the 5 components of physician autonomy that were addressed by at least 1 study. For most of these subcomponents, studies reported both potential harms and potential benefits of AI for physician autonomy. Conclusions: Little research to date has explicitly addressed the potential effects of AI on physician autonomy and existing results on these potential effects are mixed. Further qualitative and quantitative research is needed that focuses explicitly on physician autonomy and addresses all relevant components of physician autonomy. ", doi="10.2196/59295", url="https://ai.jmir.org/2025/1/e59295" } @Article{info:doi/10.2196/67696, author="Pastrak, Mila and Kajitani, Sten and Goodings, James Anthony and Drewek, Austin and LaFree, Andrew and Murphy, Adrian", title="Evaluation of ChatGPT Performance on Emergency Medicine Board Examination Questions: Observational Study", journal="JMIR AI", year="2025", month="Mar", day="12", volume="4", pages="e67696", keywords="artificial intelligence", keywords="ChatGPT-4", keywords="medical education", keywords="emergency medicine", keywords="examination", keywords="examination preparation", abstract="Background: The ever-evolving field of medicine has highlighted the potential for ChatGPT as an assistive platform. However, its use in medical board examination preparation and completion remains unclear. Objective: This study aimed to evaluate the performance of a custom-modified version of ChatGPT-4, tailored with emergency medicine board examination preparatory materials (Anki flashcard deck), compared to its default version and previous iteration (3.5). The goal was to assess the accuracy of ChatGPT-4 answering board-style questions and its suitability as a tool to aid students and trainees in standardized examination preparation. Methods: A comparative analysis was conducted using a random selection of 598 questions from the Rosh In-Training Examination Question Bank. The subjects of the study included three versions of ChatGPT: the Default, a Custom, and ChatGPT-3.5. The accuracy, response length, medical discipline subgroups, and underlying causes of error were analyzed. 
Results: The Custom version did not demonstrate a significant improvement in accuracy over the Default version (P=.61), although both significantly outperformed ChatGPT-3.5 (P<.001). The Default version produced significantly longer responses than the Custom version, with the mean (SD) values being 1371 (444) and 929 (408), respectively (P<.001). Subgroup analysis revealed no significant difference in the performance across different medical subdisciplines between the versions (P>.05 in all cases). Both the versions of ChatGPT-4 had similar underlying error types (P>.05 in all cases) and had a 99\% predicted probability of passing while ChatGPT-3.5 had an 85\% probability. Conclusions: The findings suggest that while newer versions of ChatGPT exhibit improved performance in emergency medicine board examination preparation, specific enhancement with a comprehensive Anki flashcard deck on the topic does not significantly impact accuracy. The study highlights the potential of ChatGPT-4 as a tool for medical education, capable of providing accurate support across a wide range of topics in emergency medicine in its default form. ", doi="10.2196/67696", url="https://ai.jmir.org/2025/1/e67696" } @Article{info:doi/10.2196/70100, author="Vaijainthymala Krishnamoorthy, Mahesh", title="Data Obfuscation Through Latent Space Projection for Privacy-Preserving AI Governance: Case Studies in Medical Diagnosis and Finance Fraud Detection", journal="JMIRx Med", year="2025", month="Mar", day="12", volume="6", pages="e70100", keywords="privacy-preserving AI", keywords="latent space projection", keywords="data obfuscation", keywords="AI governance", keywords="machine learning privacy", keywords="differential privacy", keywords="k-anonymity", keywords="HIPAA", keywords="GDPR", keywords="compliance", keywords="data utility", keywords="privacy-utility trade-off", keywords="responsible AI", keywords="medical imaging privacy", keywords="secure data sharing", keywords="artificial intelligence", keywords="General Data Protection Regulation", keywords="Health Insurance Portability and Accountability Act", abstract="Background: The increasing integration of artificial intelligence (AI) systems into critical societal sectors has created an urgent demand for robust privacy-preserving methods. Traditional approaches such as differential privacy and homomorphic encryption often struggle to maintain an effective balance between protecting sensitive information and preserving data utility for AI applications. This challenge has become particularly acute as organizations must comply with evolving AI governance frameworks while maintaining the effectiveness of their AI systems. Objective: This paper aims to introduce and validate data obfuscation through latent space projection (LSP), a novel privacy-preserving technique designed to enhance AI governance and ensure responsible AI compliance. The primary goal is to develop a method that can effectively protect sensitive data while maintaining essential features necessary for AI model training and inference, thereby addressing the limitations of existing privacy-preserving approaches. Methods: We developed LSP using a combination of advanced machine learning techniques, specifically leveraging autoencoder architectures and adversarial training. The method projects sensitive data into a lower-dimensional latent space, where it separates sensitive from nonsensitive information. This separation enables precise control over privacy-utility trade-offs. 
We validated LSP through comprehensive experiments on benchmark datasets and implemented 2 real-world case studies: a health care application focusing on cancer diagnosis and a financial services application analyzing fraud detection. Results: LSP demonstrated superior performance across multiple evaluation metrics. In image classification tasks, the method achieved 98.7\% accuracy while maintaining strong privacy protection, providing 97.3\% effectiveness against sensitive attribute inference attacks. This performance significantly exceeded that of traditional anonymization and privacy-preserving methods. The real-world case studies further validated LSP's effectiveness, showing robust performance in both health care and financial applications. Additionally, LSP demonstrated strong alignment with global AI governance frameworks, including the General Data Protection Regulation, the California Consumer Privacy Act, and the Health Insurance Portability and Accountability Act. Conclusions: LSP represents a significant advancement in privacy-preserving AI, offering a promising approach to developing AI systems that respect individual privacy while delivering valuable insights. By embedding privacy protection directly within the machine learning pipeline, LSP contributes to key principles of fairness, transparency, and accountability. Future research directions include developing theoretical privacy guarantees, exploring integration with federated learning systems, and enhancing latent space interpretability. These developments position LSP as a crucial tool for advancing ethical AI practices and ensuring responsible technology deployment in privacy-sensitive domains. ", doi="10.2196/70100", url="https://xmed.jmir.org/2025/1/e70100" } @Article{info:doi/10.2196/60435, author="Habicht, Johanna and Dina, Larisa-Maria and McFadyen, Jessica and Stylianou, Mona and Harper, Ross and Hauser, U. Tobias and Rollwage, Max", title="Generative AI--Enabled Therapy Support Tool for Improved Clinical Outcomes and Patient Engagement in Group Therapy: Real-World Observational Study", journal="J Med Internet Res", year="2025", month="Mar", day="10", volume="27", pages="e60435", keywords="artificial intelligence", keywords="National Health Service", keywords="NHS Talking Therapies", keywords="mental health", keywords="therapy support tool", keywords="cognitive behavioral therapy", keywords="CBT", keywords="chatbot", keywords="conversational agent", keywords="clinical", keywords="patient engagement", keywords="therapist", keywords="treatment", keywords="medication", keywords="depression", keywords="anxiety disorder", keywords="exercise", keywords="observational study", keywords="control group", keywords="patient adherence", abstract="Background: Cognitive behavioral therapy (CBT) is a highly effective treatment for depression and anxiety disorders. Nonetheless, a substantial proportion of patients do not respond to treatment. The lack of engagement with therapeutic materials and exercises between sessions, a necessary component of CBT, is a key determinant of unsuccessful treatment. Objective: The objective of this study was to test whether the deployment of a generative artificial intelligence (AI)--enabled therapy support tool, which helps patients to engage with therapeutic materials and exercises in between sessions, leads to improved treatment success and patient treatment adherence compared with the standard delivery of CBT exercises through static workbooks. 
Methods: We conducted a real-world observational study of 244 patients receiving group-based CBT in 5 of the United Kingdom's National Health Service Talking Therapies services, comparing 150 (61.5\%) patients who used the AI-enabled therapy support tool to 94 (38.5\%) patients who used the standard delivery of CBT exercises. The groups were equivalent with respect to the content of the CBT materials and the human-led therapy sessions; however, the intervention group received support from the AI-enabled therapy support tool in conducting CBT exercises. Results: Patients using the AI-enabled therapy support tool exhibited greater attendance at therapy sessions and fewer dropouts from treatment. Furthermore, these patients demonstrated higher reliable improvement, recovery, and reliable recovery rates when compared to the control group, which was related to the degree of use of the AI-enabled therapy support tool. Moreover, we found that engagement with AI-supported CBT interventions, relative to psychoeducational materials, predicted better treatment adherence and treatment success, highlighting the role of personalization in the intervention's effectiveness. To investigate the mechanisms of these effects further, we conducted a separate qualitative experiment in a nonclinical sample of users (n=113). Results indicated that users perceived the AI-enabled therapy support tool as most useful for discussing their problems to gain awareness and clarity of their situation as well as learning how to apply coping skills and CBT techniques in their daily lives. Conclusions: Our results show that an AI-enabled, personalized therapy support tool in combination with human-led group therapy is a promising avenue to improve the efficacy of and adherence to mental health care. ", doi="10.2196/60435", url="https://www.jmir.org/2025/1/e60435", url="http://www.ncbi.nlm.nih.gov/pubmed/40063074" } @Article{info:doi/10.2196/60391, author="Shmilovitch, Haim Amit and Katson, Mark and Cohen-Shelly, Michal and Peretz, Shlomi and Aran, Dvir and Shelly, Shahar", title="GPT-4 as a Clinical Decision Support Tool in Ischemic Stroke Management: Evaluation Study", journal="JMIR AI", year="2025", month="Mar", day="7", volume="4", pages="e60391", keywords="GPT-4", keywords="ischemic stroke", keywords="clinical decision support", keywords="artificial intelligence", keywords="neurology", abstract="Background: Cerebrovascular diseases are the second most common cause of death worldwide and one of the major causes of disability burden. Advancements in artificial intelligence have the potential to revolutionize health care delivery, particularly in critical decision-making scenarios such as ischemic stroke management. Objective: This study aims to evaluate the effectiveness of GPT-4 in providing clinical support for emergency department neurologists by comparing its recommendations with expert opinions and real-world outcomes in acute ischemic stroke management. Methods: A cohort of 100 patients with acute stroke symptoms was retrospectively reviewed. Data used for decision-making included patients' history, clinical evaluation, imaging study results, and other relevant details. Each case was independently presented to GPT-4, which provided scaled recommendations (1-7) regarding the appropriateness of treatment, the use of tissue plasminogen activator, and the need for endovascular thrombectomy. Additionally, GPT-4 estimated the 90-day mortality probability for each patient and elucidated its reasoning for each recommendation. 
The recommendations were then compared with a stroke specialist's opinion and actual treatment decisions. Results: In our cohort of 100 patients, treatment recommendations by GPT-4 showed strong agreement with expert opinion (area under the curve [AUC] 0.85, 95\% CI 0.77-0.93) and real-world treatment decisions (AUC 0.80, 95\% CI 0.69-0.91). GPT-4 showed near-perfect agreement with real-world decisions in recommending endovascular thrombectomy (AUC 0.94, 95\% CI 0.89-0.98) and strong agreement for tissue plasminogen activator treatment (AUC 0.77, 95\% CI 0.68-0.86). Notably, in some cases, GPT-4 recommended more aggressive treatment than human experts, with 11 instances where GPT-4 suggested tissue plasminogen activator use against expert opinion. For mortality prediction, GPT-4 accurately identified 10 (77\%) out of 13 deaths within its top 25 high-risk predictions (AUC 0.89, 95\% CI 0.8077-0.9739; hazard ratio 6.98, 95\% CI 2.88-16.9; P<.001), outperforming supervised machine learning models such as PRACTICE (AUC 0.70; log-rank P=.02) and PREMISE (AUC 0.77; P=.07). Conclusions: This study demonstrates the potential of GPT-4 as a viable clinical decision-support tool in the management of acute stroke. Its ability to provide explainable recommendations without requiring structured data input aligns well with the routine workflows of treating physicians. However, the tendency toward more aggressive treatment recommendations highlights the importance of human oversight in clinical decision-making. Future studies should focus on prospective validations and exploring the safe integration of such artificial intelligence tools into clinical practice. ", doi="10.2196/60391", url="https://ai.jmir.org/2025/1/e60391", url="http://www.ncbi.nlm.nih.gov/pubmed/40053715" } @Article{info:doi/10.2196/64349, author="Elvas, B. Luis and Almeida, Ana and Ferreira, C. Joao", title="The Role of AI in Cardiovascular Event Monitoring and Early Detection: Scoping Literature Review", journal="JMIR Med Inform", year="2025", month="Mar", day="6", volume="13", pages="e64349", keywords="artificial intelligence", keywords="machine learning", keywords="cardiovascular diseases", keywords="cardiovascular events", keywords="health care", keywords="monitoring", keywords="early detection", keywords="AI", keywords="cardiovascular", keywords="literature review", keywords="medical data", keywords="detect", keywords="patient outcomes", keywords="neural network", keywords="ML model", keywords="mobile phone", abstract="Background: Artificial intelligence (AI) has shown exponential growth and advancements, revolutionizing various fields, including health care. However, domain adaptation remains a significant challenge, as machine learning (ML) models often need to be applied across different health care settings with varying patient demographics and practices. This issue is critical for ensuring effective and equitable AI deployment. Cardiovascular diseases (CVDs), the leading cause of global mortality with 17.9 million annual deaths, encompass conditions like coronary heart disease and hypertension. The increasing availability of medical data, coupled with AI advancements, offers new opportunities for early detection and intervention in cardiovascular events, leveraging AI's capacity to analyze complex datasets and uncover critical patterns. 
Objective: This review aims to examine AI methodologies combined with medical data to advance the intelligent monitoring and detection of CVDs, identifying areas for further research to enhance patient outcomes and support early interventions. Methods: This review follows the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) methodology to ensure a rigorous and transparent literature review process. This structured approach facilitated a comprehensive overview of the current state of research in this field. Results: Through the methodology used, 64 documents were retrieved, of which 40 documents met the inclusion criteria. The reviewed papers demonstrate advancements in AI and ML for CVD detection, classification, prediction, diagnosis, and patient monitoring. Techniques such as ensemble learning, deep neural networks, and feature selection improve prediction accuracy over traditional methods. ML models predict cardiovascular events and risks, with applications in monitoring via wearable technology. The integration of AI in health care supports early detection, personalized treatment, and risk assessment, possibly improving the management of CVDs. Conclusions: The study concludes that AI and ML techniques can improve the accuracy of CVD classification, prediction, diagnosis, and monitoring. The integration of multiple data sources and noninvasive methods supports continuous monitoring and early detection. These advancements help enhance CVD management and patient outcomes, indicating the potential for AI to offer more precise and cost-effective solutions in health care. ", doi="10.2196/64349", url="https://medinform.jmir.org/2025/1/e64349" } @Article{info:doi/10.2196/66622, author="Zhang, Subo and Zhu, Zhitao and Yu, Zhenfei and Sun, Haifeng and Sun, Yi and Huang, Hai and Xu, Lei and Wan, Jinxin", title="Effectiveness of AI for Enhancing Computed Tomography Image Quality and Radiation Protection in Radiology: Systematic Review and Meta-Analysis", journal="J Med Internet Res", year="2025", month="Feb", day="27", volume="27", pages="e66622", keywords="artificial intelligence", keywords="computed tomography", keywords="image quality", keywords="radiation protection", keywords="meta-analysis", abstract="Background: Artificial intelligence (AI) presents a promising approach to balancing high image quality with reduced radiation exposure in computed tomography (CT) imaging. Objective: This meta-analysis evaluates the effectiveness of AI in enhancing CT image quality and lowering radiation doses. Methods: A thorough literature search was performed across several databases, including PubMed, Embase, Web of Science, Science Direct, and Cochrane Library, with the final update in 2024. We included studies that compared AI-based interventions to conventional CT techniques. The quality of these studies was assessed using the Newcastle-Ottawa Scale. Random effect models were used to pool results, and heterogeneity was measured using the I{\texttwosuperior} statistic. Primary outcomes included image quality, CT dose index, and diagnostic accuracy. Results: This meta-analysis incorporated 5 clinical validation studies published between 2022 and 2024, totaling 929 participants. Results indicated that AI-based interventions significantly improved image quality (mean difference 0.70, 95\% CI 0.43-0.96; P<.001) and showed a positive trend in reducing the CT dose index, though not statistically significant (mean difference 0.47, 95\% CI --0.21 to 1.15; P=.18). 
AI also enhanced image analysis efficiency (odds ratio 1.57, 95\% CI 1.08-2.27; P=.02) and demonstrated high accuracy and sensitivity in detecting intracranial aneurysms, with low-dose CT using AI reconstruction showing noninferiority for liver lesion detection. Conclusions: The findings suggest that AI-based interventions can significantly enhance CT imaging practices by improving image quality and potentially reducing radiation doses, which may lead to better diagnostic accuracy and patient safety. However, these results should be interpreted with caution due to the limited number of studies and the variability in AI algorithms. Further research is needed to clarify AI's impact on radiation reduction and to establish clinical standards. ", doi="10.2196/66622", url="https://www.jmir.org/2025/1/e66622", url="http://www.ncbi.nlm.nih.gov/pubmed/40053787" } @Article{info:doi/10.2196/53892, author="Cabral, Pereira Bernardo and Braga, Maciel Luiza Amara and Conte Filho, Gilbert Carlos and Penteado, Bruno and Freire de Castro Silva, Luis Sandro and Castro, Leonardo and Fornazin, Marcelo and Mota, Fabio", title="Future Use of AI in Diagnostic Medicine: 2-Wave Cross-Sectional Survey Study", journal="J Med Internet Res", year="2025", month="Feb", day="27", volume="27", pages="e53892", keywords="artificial intelligence", keywords="AI", keywords="diagnostic medicine", keywords="survey research", keywords="researcher opinion", keywords="future", abstract="Background: The rapid evolution of artificial intelligence (AI) presents transformative potential for diagnostic medicine, offering opportunities to enhance diagnostic accuracy, reduce costs, and improve patient outcomes. Objective: This study aimed to assess the expected future impact of AI on diagnostic medicine by comparing global researchers' expectations using 2 cross-sectional surveys. Methods: The surveys were conducted in September 2020 and February 2023. Each survey captured a 10-year projection horizon, gathering insights from >3700 researchers with expertise in AI and diagnostic medicine from all over the world. The survey sought to understand the perceived benefits, integration challenges, and evolving attitudes toward AI use in diagnostic settings. Results: Results indicated a strong expectation among researchers that AI will substantially influence diagnostic medicine within the next decade. Key anticipated benefits include enhanced diagnostic reliability, reduced screening costs, improved patient care, and decreased physician workload, addressing the growing demand for diagnostic services outpacing the supply of medical professionals. Specifically, x-ray diagnosis, heart rhythm interpretation, and skin malignancy detection were identified as the diagnostic tools most likely to be integrated with AI technologies due to their maturity and existing AI applications. The surveys highlighted the growing optimism regarding AI's ability to transform traditional diagnostic pathways and enhance clinical decision-making processes. Furthermore, the study identified barriers to the integration of AI in diagnostic medicine. The primary challenges cited were the difficulties of embedding AI within existing clinical workflows, ethical and regulatory concerns, and data privacy issues. Respondents emphasized uncertainties around legal responsibility and accountability for AI-supported clinical decisions, data protection challenges, and the need for robust regulatory frameworks to ensure safe AI deployment. 
Ethical concerns, particularly those related to algorithmic transparency and bias, were noted as increasingly critical, reflecting a heightened awareness of the potential risks associated with AI adoption in clinical settings. Differences between the 2 survey waves indicated a growing focus on ethical and regulatory issues, suggesting an evolving recognition of these challenges over time. Conclusions: Despite these barriers, there was notable consistency in researchers' expectations across the 2 survey periods, indicating a stable and sustained outlook on AI's transformative potential in diagnostic medicine. The findings show the need for interdisciplinary collaboration among clinicians, AI developers, and regulators to address ethical and practical challenges while maximizing AI's benefits. This study offers insights into the projected trajectory of AI in diagnostic medicine, guiding stakeholders, including health care providers, policy makers, and technology developers, on navigating the opportunities and challenges of AI integration. ", doi="10.2196/53892", url="https://www.jmir.org/2025/1/e53892", url="http://www.ncbi.nlm.nih.gov/pubmed/40053779" } @Article{info:doi/10.2196/52358, author="Ejaz, Hamza and Tsui, Keith Hon Lung and Patel, Mehul and Ulloa Paredes, Rafael Luis and Knights, Ellen and Aftab, Bakht Shah and Subbe, Peter Christian", title="Comparison of a Novel Machine Learning--Based Clinical Query Platform With Traditional Guideline Searches for Hospital Emergencies: Prospective Pilot Study of User Experience and Time Efficiency", journal="JMIR Hum Factors", year="2025", month="Feb", day="25", volume="12", pages="e52358", keywords="artificial intelligence", keywords="machine learning", keywords="information search", keywords="emergency care", keywords="developing", keywords="testing", keywords="information retrieval", keywords="hospital care", keywords="training", keywords="clinical practice", keywords="clinical experience", keywords="user satisfaction", keywords="clinical impact", keywords="user group", keywords="users", keywords="study design", keywords="mobile phone", abstract="Background: Emergency and acute medicine doctors require easily accessible evidence-based information to safely manage a wide range of clinical presentations. The inability to find evidence-based local guidelines on the trust's intranet leads to information retrieval from the World Wide Web. Artificial intelligence (AI) has the potential to make evidence-based information retrieval faster and easier. Objective: The aim of the study is to conduct a time-motion analysis, comparing cohorts of junior doctors using (1) an AI-supported search engine versus (2) the traditional hospital intranet. The study also aims to examine the impact of the AI-supported search engine on the duration of searches and workflow when seeking answers to clinical queries at the point of care. Methods: This pre- and postobservational study was conducted in 2 phases. In the first phase, clinical information searches by 10 doctors caring for acutely unwell patients in acute medicine were observed during 10 working days. Based on these findings and input from a focus group of 14 clinicians, an AI-supported, context-sensitive search engine was implemented. In the second phase, clinical practice was observed for 10 doctors for an additional 10 working days using the new search engine. 
Results: The hospital intranet group (n=10) had a median of 23 months of clinical experience, while the AI-supported search engine group (n=10) had a median of 54 months. Participants using the AI-supported engine conducted fewer searches. User satisfaction and query resolution rates were similar between the 2 phases. Searches with the AI-supported engine took 43 seconds longer on average. Clinicians rated the new app with a favorable Net Promoter Score of 20. Conclusions: We report a successful feasibility pilot of an AI-driven search engine for clinical guidelines. Further development of the engine, including the incorporation of large language models, might improve accuracy and speed. More research is required to establish clinical impact in different user groups. Focusing on new staff at the beginning of their post might be the most suitable study design. ", doi="10.2196/52358", url="https://humanfactors.jmir.org/2025/1/e52358" } @Article{info:doi/10.2196/68347, author="Hadar-Shoval, Dorit and Lvovsky, Maya and Asraf, Kfir and Shimoni, Yoav and Elyoseph, Zohar", title="The Feasibility of Large Language Models in Verbal Comprehension Assessment: Mixed Methods Feasibility Study", journal="JMIR Form Res", year="2025", month="Feb", day="24", volume="9", pages="e68347", keywords="large language models", keywords="verbal comprehension assessment", keywords="artificial intelligence", keywords="AI in psychodiagnostics", keywords="personalized intelligence tests", keywords="verbal comprehension index", keywords="Wechsler Adult Intelligence Scale", keywords="WAIS-III", keywords="psychological test validity", keywords="ethics in computerized cognitive assessment", abstract="Background: Cognitive assessment is an important component of applied psychology, but limited access and high costs make these evaluations challenging. Objective: This study aimed to examine the feasibility of using large language models (LLMs) to create personalized artificial intelligence--based verbal comprehension tests (AI-BVCTs) for assessing verbal intelligence, in contrast with traditional assessment methods based on standardized norms. Methods: We used a within-participants design, comparing scores obtained from AI-BVCTs with those from the Wechsler Adult Intelligence Scale (WAIS-III) verbal comprehension index (VCI). In total, 8 Hebrew-speaking participants completed both the VCI and AI-BVCT, the latter being generated using the LLM Claude. Results: The concordance correlation coefficient (CCC) demonstrated strong agreement between AI-BVCT and VCI scores (Claude: CCC=.75, 90\% CI 0.266-0.933; GPT-4: CCC=.73, 90\% CI 0.170-0.935). Pearson correlations further supported these findings, showing strong associations between VCI and AI-BVCT scores (Claude: r=.84, P<.001; GPT-4: r=.77, P=.02). No statistically significant differences were found between AI-BVCT and VCI scores (P>.05). Conclusions: These findings support the potential of LLMs to assess verbal intelligence. The study attests to the promise of AI-based cognitive tests in increasing the accessibility and affordability of assessment processes, enabling personalized testing. The research also raises ethical concerns regarding privacy and overreliance on AI in clinical work. Further research with larger and more diverse samples is needed to establish the validity and reliability of this approach and develop more accurate scoring procedures. 
", doi="10.2196/68347", url="https://formative.jmir.org/2025/1/e68347" } @Article{info:doi/10.2196/58670, author="Gao, Yanjun and Li, Ruizhe and Croxford, Emma and Caskey, John and Patterson, W. Brian and Churpek, Matthew and Miller, Timothy and Dligach, Dmitriy and Afshar, Majid", title="Leveraging Medical Knowledge Graphs Into Large Language Models for Diagnosis Prediction: Design and Application Study", journal="JMIR AI", year="2025", month="Feb", day="24", volume="4", pages="e58670", keywords="knowledge graph", keywords="natural language processing", keywords="machine learning", keywords="electronic health record", keywords="large language model", keywords="diagnosis prediction", keywords="graph model", keywords="artificial intelligence", abstract="Background: Electronic health records (EHRs) and routine documentation practices play a vital role in patients' daily care, providing a holistic record of health, diagnoses, and treatment. However, complex and verbose EHR narratives can overwhelm health care providers, increasing the risk of diagnostic inaccuracies. While large language models (LLMs) have showcased their potential in diverse language tasks, their application in health care must prioritize the minimization of diagnostic errors and the prevention of patient harm. Integrating knowledge graphs (KGs) into LLMs offers a promising approach because structured knowledge from KGs could enhance LLMs' diagnostic reasoning by providing contextually relevant medical information. Objective: This study introduces DR.KNOWS (Diagnostic Reasoning Knowledge Graph System), a model that integrates Unified Medical Language System--based KGs with LLMs to improve diagnostic predictions from EHR data by retrieving contextually relevant paths aligned with patient-specific information. Methods: DR.KNOWS combines a stack graph isomorphism network for node embedding with an attention-based path ranker to identify and rank knowledge paths relevant to a patient's clinical context. We evaluated DR.KNOWS on 2 real-world EHR datasets from different geographic locations, comparing its performance to baseline models, including QuickUMLS and standard LLMs (Text-to-Text Transfer Transformer and ChatGPT). To assess diagnostic reasoning quality, we designed and implemented a human evaluation framework grounded in clinical safety metrics. Results: DR.KNOWS demonstrated notable improvements over baseline models, showing higher accuracy in extracting diagnostic concepts and enhanced diagnostic prediction metrics. Prompt-based fine-tuning of Text-to-Text Transfer Transformer with DR.KNOWS knowledge paths achieved the highest ROUGE-L (Recall-Oriented Understudy for Gisting Evaluation--Longest Common Subsequence) and concept unique identifier F1-scores, highlighting the benefits of KG integration. Human evaluators found the diagnostic rationales of DR.KNOWS to be aligned strongly with correct clinical reasoning, indicating improved abstraction and reasoning. Recognized limitations include potential biases within the KG data, which we addressed by emphasizing case-specific path selection and proposing future bias-mitigation strategies. Conclusions: DR.KNOWS offers a robust approach for enhancing diagnostic accuracy and reasoning by integrating structured KG knowledge into LLM-based clinical workflows. 
Although further work is required to address KG biases and extend generalizability, DR.KNOWS represents progress toward trustworthy artificial intelligence--driven clinical decision support, with a human evaluation framework focused on diagnostic safety and alignment with clinical standards. ", doi="10.2196/58670", url="https://ai.jmir.org/2025/1/e58670" } @Article{info:doi/10.2196/53026, author="Fang, Ruijie and Hosseini, Elahe and Zhang, Ruoyu and Fang, Chongzhou and Rafatirad, Setareh and Homayoun, Houman", title="Survey on Pain Detection Using Machine Learning Models: Narrative Review", journal="JMIR AI", year="2025", month="Feb", day="24", volume="4", pages="e53026", keywords="pain", keywords="pain assessment", keywords="machine learning", keywords="survey", keywords="mobile phone", abstract="Background: Pain, a leading reason people seek medical care, has become a social issue. Automated pain assessment has seen notable advancements over recent decades, addressing a critical need in both clinical and everyday settings. Objective: The objective of this survey was to provide a comprehensive overview of pain and its mechanisms, to explore existing research on automated pain recognition modalities, and to identify key challenges and future directions in this field. Methods: A literature review was conducted, analyzing studies focused on various modalities for automated pain recognition. The modalities reviewed include facial expressions, physiological signals, audio cues, and pupil dilation, with a focus on their efficacy and application in pain assessment. Results: The survey found that each modality offers unique contributions to automated pain recognition, with facial expressions and physiological signals showing particular promise. However, the reliability and accuracy of these modalities vary, often depending on factors such as individual variability and environmental conditions. Conclusions: While automated pain recognition has progressed considerably, challenges remain in achieving consistent accuracy across diverse populations and contexts. Future research directions are suggested to address these challenges, enhancing the reliability and applicability of automated pain assessment in clinical practice. ", doi="10.2196/53026", url="https://ai.jmir.org/2025/1/e53026" } @Article{info:doi/10.2196/63701, author="Hornstein, Silvan and Lueken, Ulrike and Wundrack, Richard and Hilbert, Kevin", title="Predicting Satisfaction With Chat-Counseling at a 24/7 Chat Hotline for the Youth: Natural Language Processing Study", journal="JMIR AI", year="2025", month="Feb", day="18", volume="4", pages="e63701", keywords="digital mental health", keywords="mental illness", keywords="mental disorder", keywords="adolescence", keywords="chat counseling", keywords="machine learning", keywords="artificial intelligence", keywords="large language model", keywords="natural language processing", keywords="deep learning", abstract="Background: Chat-based counseling services are popular for the low-threshold provision of mental health support to youth. In addition, they are particularly suitable for the utilization of natural language processing (NLP) for improved provision of care. Objective: Consequently, this paper evaluates the feasibility of such a use case, namely, the NLP-based automated evaluation of satisfaction with the chat interaction. This preregistered approach could be used for evaluation and quality control procedures, as it is particularly relevant for those services. 
Methods: The consultations of 2609 young chatters (around 140,000 messages) and corresponding feedback were used to train and evaluate classifiers to predict whether a chat was perceived as helpful or not. On the one hand, we trained a word vectorizer in combination with an extreme gradient boosting (XGBoost) classifier, applying cross-validation and extensive hyperparameter tuning. On the other hand, we trained several transformer-based models, comparing model types, preprocessing, and over- and undersampling techniques. For both model types, we selected the best-performing approach on the training set for a final performance evaluation on the 522 users in the final test set. Results: The fine-tuned XGBoost classifier achieved an area under the receiver operating characteristic score of 0.69 (P<.001), as well as a Matthews correlation coefficient of 0.25 on the previously unseen test set. The selected Longformer-based model did not outperform this baseline, scoring 0.68 (P=.69). A Shapley additive explanations explainability approach suggested that help seekers rating a consultation as helpful commonly expressed their satisfaction already within the conversation. In contrast, the rejection of offered exercises predicted perceived unhelpfulness. Conclusions: Chat conversations include relevant information regarding the perceived quality of an interaction that can be used by NLP-based prediction approaches. However, to determine if the moderate predictive performance translates into meaningful service improvements requires randomized trials. Further, our results highlight the relevance of contrasting pretrained models with simpler baselines to avoid the implementation of unnecessarily complex models. Trial Registration: Open Science Framework SR4Q9; https://osf.io/sr4q9 ", doi="10.2196/63701", url="https://ai.jmir.org/2025/1/e63701" } @Article{info:doi/10.2196/65146, author="Yang, Zhichao and Yao, Zonghai and Tasmin, Mahbuba and Vashisht, Parth and Jang, Seok Won and Ouyang, Feiyun and Wang, Beining and McManus, David and Berlowitz, Dan and Yu, Hong", title="Unveiling GPT-4V's hidden challenges behind high accuracy on USMLE questions: Observational Study", journal="J Med Internet Res", year="2025", month="Feb", day="7", volume="27", pages="e65146", keywords="artificial intelligence", keywords="natural language processing", keywords="large language model", keywords="LLM", keywords="ChatGPT", keywords="GPT", keywords="GPT-4V", keywords="USMLE", keywords="Medical License Exam", keywords="medical image interpretation", keywords="United States Medical Licensing Examination", keywords="NLP", abstract="Background: Recent advancements in artificial intelligence, such as GPT-3.5 Turbo (OpenAI) and GPT-4, have demonstrated significant potential by achieving good scores on text-only United States Medical Licensing Examination (USMLE) exams and effectively answering questions from physicians. However, the ability of these models to interpret medical images remains underexplored. Objective: This study aimed to comprehensively evaluate the performance, interpretability, and limitations of GPT-3.5 Turbo, GPT-4, and its successor, GPT-4 Vision (GPT-4V), specifically focusing on GPT-4V's newly introduced image-understanding feature. 
By assessing the models on medical licensing examination questions that require image interpretation, we sought to highlight the strengths and weaknesses of GPT-4V in handling complex multimodal clinical information, thereby exposing hidden flaws and providing insights into its readiness for integration into clinical settings. Methods: This cross-sectional study tested GPT-4V, GPT-4, and ChatGPT-3.5 Turbo on a total of 227 multiple-choice questions with images from USMLE Step 1 (n=19), Step 2 clinical knowledge (n=14), Step 3 (n=18), the Diagnostic Radiology Qualifying Core Exam (DRQCE) (n=26), and AMBOSS question banks (n=150). AMBOSS provided expert-written hints and question difficulty levels. GPT-4V's accuracy was compared with 2 state-of-the-art large language models, GPT-3.5 Turbo and GPT-4. The quality of the explanations was evaluated by choosing human preference between an explanation by GPT-4V (without hint), an explanation by an expert, or a tie, using 3 qualitative metrics: comprehensive explanation, question information, and image interpretation. To better understand GPT-4V's explanation ability, we modified a patient case report to resemble a typical ``curbside consultation'' between physicians. Results: For questions with images, GPT-4V achieved an accuracy of 84.2\%, 85.7\%, 88.9\%, and 73.1\% in Step 1, Step 2 clinical knowledge, Step 3 of USMLE, and DRQCE, respectively. It outperformed GPT-3.5 Turbo (42.1\%, 50\%, 50\%, 19.2\%) and GPT-4 (63.2\%, 64.3\%, 66.7\%, 26.9\%). When GPT-4V answered correctly, its explanations were nearly as good as those provided by domain experts from AMBOSS. However, incorrect answers often had poor explanation quality: 18.2\% (10/55) contained inaccurate text, 45.5\% (25/55) had inference errors, and 76.3\% (42/55) demonstrated image misunderstandings. With human expert assistance, GPT-4V reduced errors by an average of 40\% (22/55). GPT-4V accuracy improved with hints, maintaining stable performance across difficulty levels, while medical student performance declined as difficulty increased. In a simulated curbside consultation scenario, GPT-4V required multiple specific prompts to interpret complex case data accurately. Conclusions: GPT-4V achieved high accuracy on multiple-choice questions with images, highlighting its potential in medical assessments. However, significant shortcomings were observed in the quality of explanations when questions were answered incorrectly, particularly in the interpretation of images, which could not be efficiently resolved through expert interaction. These findings reveal hidden flaws in the image interpretation capabilities of GPT-4V, underscoring the need for more comprehensive evaluations beyond multiple-choice questions before integrating GPT-4V into clinical settings. 
", doi="10.2196/65146", url="https://www.jmir.org/2025/1/e65146" } @Article{info:doi/10.2196/57319, author="Nielsen, Joshua and Chen, Xiaoyu and Davis, LaShara and Waterman, Amy and Gentili, Monica", title="Investigating the Classification of Living Kidney Donation Experiences on Reddit and Understanding the Sensitivity of ChatGPT to Prompt Engineering: Content Analysis", journal="JMIR AI", year="2025", month="Feb", day="7", volume="4", pages="e57319", keywords="prompt engineering", keywords="generative artificial intelligence", keywords="kidney donation", keywords="transplant", keywords="living donor", abstract="Background: Living kidney donation (LKD), where individuals donate one kidney while alive, plays a critical role in increasing the number of kidneys available for those experiencing kidney failure. Previous studies show that many generous people are interested in becoming living donors; however, a huge gap exists between the number of patients on the waiting list and the number of living donors yearly. Objective: To bridge this gap, we aimed to investigate how to identify potential living donors from discussions on public social media forums so that educational interventions could later be directed to them. Methods: Using Reddit forums as an example, this study described the classification of Reddit content shared about LKD into three classes: (1) present (presently dealing with LKD personally), (2) past (dealt with LKD personally in the past), and (3) other (LKD general comments). An evaluation was conducted comparing a fine-tuned distilled version of the Bidirectional Encoder Representations from Transformers (BERT) model with inference using GPT-3.5 (ChatGPT). To systematically evaluate ChatGPT's sensitivity to distinguishing between the 3 prompt categories, we used a comprehensive prompt engineering strategy encompassing a full factorial analysis in 48 runs. A novel prompt engineering approach, dialogue until classification consensus, was introduced to simulate a deliberation between 2 domain experts until a consensus on classification was achieved. Results: BERT and GPT-3.5 exhibited classification accuracies of approximately 75\% and 78\%, respectively. Recognizing the inherent ambiguity between classes, a post hoc analysis of incorrect predictions revealed sensible reasoning and acceptable errors in the predictive models. Considering these acceptable mismatched predictions, the accuracy improved to 89.3\% for BERT and 90.7\% for GPT-3.5. Conclusions: Large language models, such as GPT-3.5, are highly capable of detecting and categorizing LKD-targeted content on social media forums. They are sensitive to instructions, and the introduced dialogue until classification consensus method exhibited superior performance over stand-alone reasoning, highlighting the merit in advancing prompt engineering methodologies. The models can produce appropriate contextual reasoning, even when final conclusions differ from their human counterparts. 
", doi="10.2196/57319", url="https://ai.jmir.org/2025/1/e57319", url="http://www.ncbi.nlm.nih.gov/pubmed/39918869" } @Article{info:doi/10.2196/60847, author="Choudhury, Ananya and Volmer, Leroy and Martin, Frank and Fijten, Rianne and Wee, Leonard and Dekker, Andre and Soest, van Johan", title="Advancing Privacy-Preserving Health Care Analytics and Implementation of the Personal Health Train: Federated Deep Learning Study", journal="JMIR AI", year="2025", month="Feb", day="6", volume="4", pages="e60847", keywords="gross tumor volume segmentation", keywords="federated learning infrastructure", keywords="privacy-preserving technology", keywords="cancer", keywords="deep learning", keywords="artificial intelligence", keywords="lung cancer", keywords="oncology", keywords="radiotherapy", keywords="imaging", keywords="data protection", keywords="data privacy", abstract="Background: The rapid advancement of deep learning in health care presents significant opportunities for automating complex medical tasks and improving clinical workflows. However, widespread adoption is impeded by data privacy concerns and the necessity for large, diverse datasets across multiple institutions. Federated learning (FL) has emerged as a viable solution, enabling collaborative artificial intelligence model development without sharing individual patient data. To effectively implement FL in health care, robust and secure infrastructures are essential. Developing such federated deep learning frameworks is crucial to harnessing the full potential of artificial intelligence while ensuring patient data privacy and regulatory compliance. Objective: The objective is to introduce an innovative FL infrastructure called the Personal Health Train (PHT) that includes the procedural, technical, and governance components needed to implement FL on real-world health care data, including training deep learning neural networks. The study aims to apply this federated deep learning infrastructure to the use case of gross tumor volume segmentation on chest computed tomography images of patients with lung cancer and present the results from a proof-of-concept experiment. Methods: The PHT framework addresses the challenges of data privacy when sharing data, by keeping data close to the source and instead bringing the analysis to the data. Technologically, PHT requires 3 interdependent components: ``tracks'' (protected communication channels), ``trains'' (containerized software apps), and ``stations'' (institutional data repositories), which are supported by the open source ``Vantage6'' software. The study applies this federated deep learning infrastructure to the use case of gross tumor volume segmentation on chest computed tomography images of patients with lung cancer, with the introduction of an additional component called the secure aggregation server, where the model averaging is done in a trusted and inaccessible environment. Results: We demonstrated the feasibility of executing deep learning algorithms in a federated manner using PHT and presented the results from a proof-of-concept study. The infrastructure linked 12 hospitals across 8 nations, covering 4 continents, demonstrating the scalability and global reach of the proposed approach. During the execution and training of the deep learning algorithm, no data were shared outside the hospital. Conclusions: The findings of the proof-of-concept study, as well as the implications and limitations of the infrastructure and the results, are discussed. 
The application of federated deep learning to unstructured medical imaging data, facilitated by the PHT framework and Vantage6 platform, represents a significant advancement in the field. The proposed infrastructure addresses the challenges of data privacy and enables collaborative model development, paving the way for the widespread adoption of deep learning--based tools in the medical domain and beyond. The introduction of the secure aggregation server implied that data leakage problems in FL can be prevented by careful design decisions of the infrastructure. Trial Registration: ClinicalTrials.gov NCT05775068; https://clinicaltrials.gov/study/NCT05775068 ", doi="10.2196/60847", url="https://ai.jmir.org/2025/1/e60847" } @Article{info:doi/10.2196/63065, author="Elhassan, Elwaleed Safia and Sajid, Raihan Muhammad and Syed, Mariam Amina and Fathima, Afreen Sidrah and Khan, Shehroz Bushra and Tamim, Hala", title="Assessing Familiarity, Usage Patterns, and Attitudes of Medical Students Toward ChatGPT and Other Chat-Based AI Apps in Medical Education: Cross-Sectional Questionnaire Study", journal="JMIR Med Educ", year="2025", month="Jan", day="30", volume="11", pages="e63065", keywords="ChatGPT", keywords="artificial intelligence", keywords="large language model", keywords="medical students", keywords="ethics", keywords="chat-based", keywords="AI apps", keywords="medical education", keywords="social media", keywords="attitude", keywords="AI", abstract="Background: There has been a rise in the popularity of ChatGPT and other chat-based artificial intelligence (AI) apps in medical education. Despite data being available from other parts of the world, there is a significant lack of information on this topic in medical education and research, particularly in Saudi Arabia. Objective: The primary objective of the study was to examine the familiarity, usage patterns, and attitudes of Alfaisal University medical students toward ChatGPT and other chat-based AI apps in medical education. Methods: This was a cross-sectional study conducted from October 8, 2023, through November 22, 2023. A questionnaire was distributed through social media channels to medical students at Alfaisal University who were 18 years or older. Current Alfaisal University medical students in years 1 through 6, of both genders, were exclusively targeted by the questionnaire. The study was approved by Alfaisal University Institutional Review Board. A $\chi$2 test was conducted to assess the relationships between gender, year of study, familiarity, and reasons for usage. Results: A total of 293 responses were received, of which 95 (32.4\%) were from men and 198 (67.6\%) were from women. There were 236 (80.5\%) responses from preclinical students and 57 (19.5\%) from clinical students, respectively. Overall, males (n=93, 97.9\%) showed more familiarity with ChatGPT compared to females (n=180, 90.09\%; P=.03). Additionally, males also used Google Bard and Microsoft Bing ChatGPT more than females (P<.001). Clinical-year students used ChatGPT significantly more for general writing purposes compared to preclinical students (P=.005). Additionally, 136 (46.4\%) students believed that using ChatGPT and other chat-based AI apps for coursework was ethical, 86 (29.4\%) were neutral, and 71 (24.2\%) considered it unethical (all Ps>.05). Conclusions: Familiarity with and usage of ChatGPT and other chat-based AI apps were common among the students of Alfaisal University. 
The usage patterns of these apps differ between males and females and between preclinical and clinical-year students. ", doi="10.2196/63065", url="https://mededu.jmir.org/2025/1/e63065" } @Article{info:doi/10.2196/64188, author="Jiang, Yiqun and Li, Qing and Huang, Yu-Li and Zhang, Wenli", title="Urgency Prediction for Medical Laboratory Tests Through Optimal Sparse Decision Tree: Case Study With Echocardiograms", journal="JMIR AI", year="2025", month="Jan", day="29", volume="4", pages="e64188", keywords="interpretable machine learning", keywords="urgency prediction", keywords="appointment scheduling", keywords="echocardiogram", keywords="health care management", abstract="Background: In the contemporary realm of health care, laboratory tests stand as cornerstone components, driving the advancement of precision medicine. These tests offer intricate insights into a variety of medical conditions, thereby facilitating diagnosis, prognosis, and treatments. However, the accessibility of certain tests is hindered by factors such as high costs, a shortage of specialized personnel, or geographic disparities, posing obstacles to achieving equitable health care. For example, an echocardiogram is a type of laboratory test that is extremely important and not easily accessible. The increasing demand for echocardiograms underscores the imperative for more efficient scheduling protocols. Despite this pressing need, limited research has been conducted in this area. Objective: The study aims to develop an interpretable machine learning model for determining the urgency of patients requiring echocardiograms, thereby aiding in the prioritization of scheduling procedures. Furthermore, this study aims to glean insights into the pivotal attributes influencing the prioritization of echocardiogram appointments, leveraging the high interpretability of the machine learning model. Methods: Empirical and predictive analyses have been conducted to assess the urgency of patients based on a large real-world echocardiogram appointment dataset (ie, 34,293 appointments) sourced from electronic health records encompassing administrative information, referral diagnosis, and underlying patient conditions. We used a state-of-the-art interpretable machine learning algorithm, the optimal sparse decision tree (OSDT), renowned for its high accuracy and interpretability, to investigate the attributes pertinent to echocardiogram appointments. Results: The method demonstrated satisfactory performance (F1-score=36.18\% and F2-score=28.18\%, improvements of 1.7\% and 0.79\%, respectively, over the best-performing baseline model). Moreover, due to its high interpretability, the results provide valuable medical insights regarding the identification of urgent patients for tests through the extraction of decision rules from the OSDT model. Conclusions: The method demonstrated state-of-the-art predictive performance, affirming its effectiveness. Furthermore, we validate the decision rules derived from the OSDT model by comparing them with established medical knowledge. These interpretable results (eg, attribute importance and decision rules from the OSDT model) underscore the potential of our approach in prioritizing patient urgency for echocardiogram appointments and can be extended to prioritize other laboratory test appointments using electronic health record data. 
", doi="10.2196/64188", url="https://ai.jmir.org/2025/1/e64188", url="http://www.ncbi.nlm.nih.gov/pubmed/39879091" } @Article{info:doi/10.2196/58834, author="Voigt, Kelly and Sun, Yingtao and Patandin, Ayush and Hendriks, Johanna and Goossens, Hendrik Richard and Verhoef, Cornelis and Husson, Olga and Gr{\"u}nhagen, Dirk and Jung, Jiwon", title="A Machine Learning Approach Using Topic Modeling to Identify and Assess Experiences of Patients With Colorectal Cancer: Explorative Study", journal="JMIR Cancer", year="2025", month="Jan", day="27", volume="11", pages="e58834", keywords="colorectal cancer", keywords="forum", keywords="topic modeling", keywords="patient journey", keywords="patient experience", keywords="AI", keywords="machine learning", keywords="cancer care", keywords="cancer survivor", keywords="United States", keywords="quality of life", keywords="post", keywords="topic", keywords="artificial intelligence", abstract="Background: The rising number of cancer survivors and the shortage of health care professionals challenge the accessibility of cancer care. Health technologies are necessary for sustaining optimal patient journeys. To understand individuals' daily lives during their patient journey, qualitative studies are crucial. However, not all patients wish to share their stories with researchers. Objective: This study aims to identify and assess patient experiences on a large scale using a novel machine learning--supported approach, leveraging data from patient forums. Methods: Forum posts of patients with colorectal cancer (CRC) from the Cancer Survivors Network USA were used as the data source. Topic modeling, as a part of machine learning, was used to recognize the topic patterns in the posts. Researchers read the most relevant 50 posts on each topic, dividing them into ``home'' or ``hospital'' contexts. A patient community journey map, derived from patients stories, was developed to visually illustrate our findings. CRC medical doctors and a quality-of-life expert evaluated the identified topics of patient experience and the map. Results: Based on 212,107 posts, 37 topics and 10 upper clusters were produced. Dominant clusters included ``Daily activities while living with CRC'' (38,782, 18.3\%) and ``Understanding treatment including alternatives and adjuvant therapy'' (31,577, 14.9\%). Topics related to the home context had more emotional content compared with the hospital context. The patient community journey map was constructed based on these findings. Conclusions: Our study highlighted the diverse concerns and experiences of patients with CRC. The more emotional content in home context discussions underscores the personal impact of CRC beyond clinical settings. Based on our study, we found that a machine learning-supported approach is a promising solution to analyze patients' experiences. The innovative application of patient community journey mapping provides a unique perspective into the challenges in patients' daily lives, which is essential for delivering appropriate support at the right moment. ", doi="10.2196/58834", url="https://cancer.jmir.org/2025/1/e58834" } @Article{info:doi/10.2196/64993, author="Biro, Joshua and Handley, L. Jessica and Cobb, K. Nathan and Kottamasu, Varsha and Collins, Jeffrey and Krevat, Seth and Ratwani, M. 
Raj", title="Accuracy and Safety of AI-Enabled Scribe Technology: Instrument Validation Study", journal="J Med Internet Res", year="2025", month="Jan", day="27", volume="27", pages="e64993", keywords="artificial intelligence", keywords="AI", keywords="patient safety", keywords="ambient digital scribe", keywords="AI-enabled scribe technology", keywords="AI scribe technology", keywords="scribe technology", keywords="accuracy", keywords="safety", keywords="ambient scribe", keywords="digital scribe", keywords="patient-clinician", keywords="patient-clinician communication", keywords="doctor-patient relationship", keywords="doctor-patient communication", keywords="patient engagement", keywords="dialogue script", keywords="scribe", doi="10.2196/64993", url="https://www.jmir.org/2025/1/e64993" } @Article{info:doi/10.2196/64649, author="Liu, Weiqi and Wu, You and Zheng, Zhuozhao and Bittle, Mark and Yu, Wei and Kharrazi, Hadi", title="Enhancing Diagnostic Accuracy of Lung Nodules in Chest Computed Tomography Using Artificial Intelligence: Retrospective Analysis", journal="J Med Internet Res", year="2025", month="Jan", day="27", volume="27", pages="e64649", keywords="artificial intelligence", keywords="diagnostic accuracy", keywords="lung nodule", keywords="radiology", keywords="AI system", abstract="Background: Uncertainty in the diagnosis of lung nodules is a challenge for both patients and physicians. Artificial intelligence (AI) systems are increasingly being integrated into medical imaging to assist diagnostic procedures. However, the accuracy of AI systems in identifying and measuring lung nodules on chest computed tomography (CT) scans remains unclear, which requires further evaluation. Objective: This study aimed to evaluate the impact of an AI-assisted diagnostic system on the diagnostic efficiency of radiologists. It specifically examined the report modification rates and missed and misdiagnosed rates of junior radiologists with and without AI assistance. Methods: We obtained effective data from 12,889 patients in 2 tertiary hospitals in Beijing before and after the implementation of the AI system, covering the period from April 2018 to March 2022. Diagnostic reports written by both junior and senior radiologists were included in each case. Using reports by senior radiologists as a reference, we compared the modification rates of reports written by junior radiologists with and without AI assistance. We further evaluated alterations in lung nodule detection capability over 3 years after the integration of the AI system. Evaluation metrics of this study include lung nodule detection rate, accuracy, false negative rate, false positive rate, and positive predictive value. The statistical analyses included descriptive statistics and chi-square, Cochran-Armitage, and Mann-Kendall tests. Results: The AI system was implemented in Beijing Anzhen Hospital (Hospital A) in January 2019 and Tsinghua Changgung Hospital (Hospital C) in June 2021. The modification rate of diagnostic reports in the detection of lung nodules increased from 4.73\% to 7.23\% ($\chi$21=12.15; P<.001) at Hospital A. In terms of lung nodule detection rates postimplementation, Hospital C increased from 46.19\% to 53.45\% ($\chi$21=25.48; P<.001) and Hospital A increased from 39.29\% to 55.22\% ($\chi$21=122.55; P<.001). At Hospital A, the false negative rate decreased from 8.4\% to 5.16\% ($\chi$21=9.85; P=.002), while the false positive rate increased from 2.36\% to 9.77\% ($\chi$21=53.48; P<.001). 
The detection accuracy demonstrated a decrease from 93.33\% to 92.23\% for Hospital A and from 95.27\% to 92.77\% for Hospital C. Regarding the changes in lung nodule detection capability over a 3-year period following the integration of the AI system, the detection rates for lung nodules exhibited a modest increase from 54.6\% to 55.84\%, while the overall accuracy demonstrated a slight improvement from 92.79\% to 93.92\%. Conclusions: The AI system enhanced lung nodule detection, offering the possibility of earlier disease identification and timely intervention. Nevertheless, the initial reduction in accuracy underscores the need for standardized diagnostic criteria and comprehensive training for radiologists to maximize the effectiveness of AI-enabled diagnostic systems. ", doi="10.2196/64649", url="https://www.jmir.org/2025/1/e64649" } @Article{info:doi/10.2196/55427, author="Marcolino, Soriano Milena and Oliveira, Ramos Lucca Fagundes and Valle, Rocha Lucas and Rosa, Santa Luiza Marinho Motta and Reis, Nogueira Zilma Silveira and Soares, Castro Thiago Barbabela de and Bernardino, Almeida Elid{\'e}a L{\'u}cia and Cordeiro, Almeida Raniere Alislan and Prates, Oliveira Raquel and Campos, Montenegro Mario Fernando", title="Sign Language Recognition System for Deaf Patients: Protocol for a Systematic Review", journal="JMIR Res Protoc", year="2025", month="Jan", day="23", volume="14", pages="e55427", keywords="computer neural networks", keywords="artificial intelligence", keywords="biomedical technology", keywords="communication aids for disabled", keywords="computer vision", keywords="sign language", keywords="hearing loss", keywords="deaf people", keywords="communication barriers", keywords="gestures", abstract="Background: Individuals with hearing impairments may face hindrances in health care assistance, which may significantly impact the prognosis and the incidence of complications and iatrogenic events. Therefore, the development of automatic communication systems to assist the interaction between this population and health care workers is paramount. Objective: This study aims to systematically review the evidence on communication systems using human-computer interaction techniques developed for deaf people who communicate through sign language that are already in use or proposed for use in health care contexts and have been tested with human users or videos of human users. Methods: A systematic review will be performed based on a literature search in MEDLINE, Web of Science, ACM, and IEEE Xplore as well as top-tiered conferences in the area to identify relevant studies. The inclusion criteria are the description of the development of a sign language recognition system in a health care context and the testing with human users. Independent investigators (LFRO, LRV, and LMMSR) will screen eligible studies, and disagreements will be solved by a senior researcher (MSM). The included papers will undergo full-text screening. A PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analysis) flow diagram will be presented to visually summarize the screening process, ensuring clarity and transparency in presenting the results. Additionally, a comprehensive chart table will be constructed to consolidate essential data related to the key variables extracted from the studies. These results will be meticulously analyzed and presented descriptively, offering insightful interpretations of the information encapsulated within the table. 
Results: A preliminary search was performed in April 2024. Researchers concluded the study selection by July 2024. Data extraction, synthesis, report, and recommendations are expected to be finished by February 2025. Conclusions: This systematic review will identify human-machine systems that enable communication in health services involving deaf patients, presenting the framework that includes usability and application in human contexts. We will present a comprehensive panel of findings, highlighting systems used to tackle communication barriers and offer a narrative comparison of current implementation practices. International Registered Report Identifier (IRRID): PRR1-10.2196/55427 ", doi="10.2196/55427", url="https://www.researchprotocols.org/2025/1/e55427", url="http://www.ncbi.nlm.nih.gov/pubmed/39847417" } @Article{info:doi/10.2196/58177, author="Kim, Hoon Ji and Kim, Joung Min and Kim, Chang Hyeon and Kim, Yan Ha and Sung, Min Ji and Chang, Hyuk-Jae", title="A Novel Artificial Intelligence--Enhanced Digital Network for Prehospital Emergency Support: Community Intervention Study", journal="J Med Internet Res", year="2025", month="Jan", day="23", volume="27", pages="e58177", keywords="emergency patient transport", keywords="transport time", keywords="artificial intelligence", keywords="smartphone", keywords="mobile phone", abstract="Background: Efficient emergency patient transport systems, which are crucial for delivering timely medical care to individuals in critical situations, face certain challenges. To address this, CONNECT-AI (CONnected Network for EMS Comprehensive Technical-Support using Artificial Intelligence), a novel digital platform, was introduced. This artificial intelligence (AI)--based network provides comprehensive technical support for the real-time sharing of medical information at the prehospital stage. Objective: This study aimed to evaluate the effectiveness of this system in reducing patient transport delays. Methods: The CONNECT-AI system provided 3 key AI services to prehospital care providers by collecting real-time patient data from the scene and hospital resource information, such as bed occupancy and the availability of emergency surgeries or procedures, using 5G communication technology and internet of things devices. These services included guidance on first aid, prediction of critically ill patients, and recommendation of the optimal transfer hospital. In addition, the platform offered emergency department medical staff real-time clinical information, including live video of patients during transport to the hospital. This community-based, nonrandomized controlled intervention study was designed to evaluate the effectiveness of the CONNECT-AI system in 2 regions of South Korea, each of which operated an intervention and a control period, each lasting 16 weeks. The impact of the system was assessed based on the proportion of patients experiencing transfer delays. Results: A total of 14,853 patients transported by public ambulance were finally selected for analysis. Overall, the median transport time was 10 (IQR 7-14) minutes in the intervention group and 9 (IQR 6-13) minutes in the control group. When comparing the incidence of transport time outliers (>75\%), which was the primary outcome of this study, the rate was higher in the intervention group in region 1, but significantly reduced in region 2, with the overall outlier rate being higher in the intervention group (27.5\%-29.7\%, P=.04). 
However, for patients with fever or respiratory symptoms, the group using the system showed a statistically significant reduction in outlier cases (36.5\%-30.1\%, P=.01). For patients who received real-time acceptance signals from the hospital, the reduction in the percentage of 75\% outliers was statistically significant compared with those without the system (27.5\%-19.6\%, P=.02). As a result of emergency department treatment, 1.5\% of patients in the control group and 1.1\% in the intervention group died (P=.14). In the system-guided optimal hospital transfer group, the mortality rate was significantly lower than in the control group (1.54\%-0.64\%, P=.01). Conclusions: The present digital emergency medical system platform offers a novel approach to enhancing emergency patient transport by leveraging AI, real-time information sharing, and decision support. While the system demonstrated improvements for certain patient groups facing transfer challenges, further research and modifications are necessary to fully realize its benefits in diverse health care contexts. Trial Registration: ClinicalTrials.gov NCT04829279; https://clinicaltrials.gov/study/NCT04829279 ", doi="10.2196/58177", url="https://www.jmir.org/2025/1/e58177", url="http://www.ncbi.nlm.nih.gov/pubmed/39847421" } @Article{info:doi/10.2196/67143, author="Eisinger, Felix and Holderried, Friederike and Mahling, Moritz and Stegemann--Philipps, Christian and Herrmann--Werner, Anne and Nazarenus, Eric and Sonanini, Alessandra and Guthoff, Martina and Eickhoff, Carsten and Holderried, Martin", title="What's Going On With Me and How Can I Better Manage My Health? The Potential of GPT-4 to Transform Discharge Letters Into Patient-Centered Letters to Enhance Patient Safety: Prospective, Exploratory Study", journal="J Med Internet Res", year="2025", month="Jan", day="21", volume="27", pages="e67143", keywords="GPT-4", keywords="patient letters", keywords="health care communication", keywords="artificial intelligence", keywords="patient safety", keywords="patient education", abstract="Background: For hospitalized patients, the discharge letter serves as a crucial source of medical information, outlining important discharge instructions and health management tasks. However, these letters are often written in professional jargon, making them difficult for patients with limited medical knowledge to understand. Large language models, such as GPT, have the potential to transform these discharge summaries into patient-friendly letters, improving accessibility and understanding. Objective: This study aims to use GPT-4 to convert discharge letters into more readable patient-centered letters. We evaluated how effectively and comprehensively GPT-4 identified and transferred patient safety--relevant information from the discharge letters to the transformed patient letters. Methods: Three discharge letters were created based on common medical conditions, containing 72 patient safety--relevant pieces of information, referred to as ``learning objectives.'' GPT-4 was prompted to transform these discharge letters into patient-centered letters. The resulting patient letters were analyzed for medical accuracy, patient centricity, and the ability to identify and translate the learning objectives. Bloom's taxonomy was applied to analyze and categorize the learning objectives. Results: GPT-4 addressed the majority (56/72, 78\%) of the learning objectives from the discharge letters. 
However, 11 of the 72 (15\%) learning objectives were not included in the majority of the patient-centered letters. A qualitative analysis based on Bloom's taxonomy revealed that learning objectives in the ``Understand'' category (9/11) were more frequently omitted than those in the ``Remember'' category (2/11). Most of the missing learning objectives were related to the content field of ``prevention of complications.'' By contrast, learning objectives regarding ``lifestyle'' and ``organizational'' aspects were addressed more frequently. Medical errors were found in a small proportion of sentences (31/787, 3.9\%). In terms of patient centricity, the patient-centered letters demonstrated better readability than the discharge letters. Compared with discharge letters, they included fewer medical terms (132/860, 15.3\%, vs 165/273, 60.4\%), fewer abbreviations (43/860, 5\%, vs 49/273, 17.9\%), and more explanations of medical terms (121/131, 92.4\%, vs 0/165, 0\%). Conclusions: Our study demonstrates that GPT-4 has the potential to transform discharge letters into more patient-centered communication. While the readability and patient centricity of the transformed letters are well-established, they do not fully address all patient safety--relevant information, resulting in the omission of key aspects. Further optimization of prompt engineering may help address this issue and improve the completeness of the transformation. ", doi="10.2196/67143", url="https://www.jmir.org/2025/1/e67143" } @Article{info:doi/10.2196/56039, author="Cho, Ukrae and Gwon, Nam Yong and Chong, Ryong Seung and Han, Yeon Ji and Kim, Kyung Do and Doo, Whan Seung and Yang, Jae Won and Kim, Kyeongmin and Shim, Ryul Sung and Jung, Jaehun and Kim, Heon Jae", title="Satisfactory Evaluation of Call Service Using AI After Ureteral Stent Insertion: Randomized Controlled Trial", journal="J Med Internet Res", year="2025", month="Jan", day="21", volume="27", pages="e56039", keywords="artificial intelligence", keywords="AI", keywords="ureteral stent", keywords="complications", keywords="randomized controlled trial", keywords="urologic procedures", keywords="urology", keywords="patients", keywords="information resources", abstract="Background: Ureteral stents, such as double-J stents, have become indispensable in urologic procedures but are associated with complications like hematuria and pain. While the advancement of artificial intelligence (AI) technology has led to its increasing application in the health sector, AI has not been used to provide information on potential complications and to facilitate subsequent measures in the event of such complications. Objective: This study aimed to assess the effectiveness of an AI-based prediction tool in providing patients with information about potential complications from ureteroscopy and ureteric stent placement and indicating the need for early additional therapy. Methods: Overall, 28 patients (aged 20-70 years) who underwent ureteral stent insertion for the first time without a history of psychological illness were consecutively included. A ``reassurance-call'' service was set up to equip patients with details about the procedure and postprocedure care, to monitor for complications and their severity. Patients were randomly allocated into 2 groups, reassurance-call by AI (group 1) and reassurance-call by humans (group 2). The primary outcome was the level of satisfaction with the reassurance-call service itself, measured using a Likert scale. 
Secondary outcomes included satisfaction with the AI-assisted reassurance-call service, also measured using a Likert scale, and the level of satisfaction (Likert scale and Visual Analogue Scale [VAS]) and anxiety (State-Trait Anxiety Inventory and VAS) related to managing complications for both groups. Results: Of the 28 recruited patients (14 in each group), 1 patient in group 2 dropped out. Baseline characteristics of patients showed no significant differences (all P>.05). Satisfaction with reassurance-call averaged 4.14 (SD 0.66; group 1) and 4.54 (SD 0.52; group 2), with no significant difference between AI and humans (P=.11). AI-assisted reassurance-call satisfaction averaged 3.43 (SD 0.94). Satisfaction about the management of complications using the Likert scale averaged 3.79 (SD 0.70) and 4.23 (SD 0.83), respectively, showing no significant difference (P=.14), but a significant difference was observed when using the VAS (P=.01), with 6.64 (SD 2.13) in group 1 and 8.69 (SD 1.80) in group 2. Anxiety about complications using the State-Trait Anxiety Inventory averaged 36.43 (SD 9.17) and 39.23 (SD 8.51; P=.33), while anxiety assessed with VAS averaged 4.86 (SD 2.28) and 3.46 (SD 3.38; P=.18), respectively, showing no significant differences. Multiple regression analysis was performed on all outcomes, and humans showed higher satisfaction than AI in the management of complications. Otherwise, most of the other variables showed no significant differences (P>.05). Conclusions: This is the first study to use AI for patient reassurance regarding complications after ureteric stent placement. The study found that patients were similarly satisfied with reassurance calls conducted by AI or humans. Further research in larger populations is warranted to confirm these findings. Trial Registration: Clinical Research Information System KCT0008062; https://tinyurl.com/4s8725w2 ", doi="10.2196/56039", url="https://www.jmir.org/2025/1/e56039" } @Article{info:doi/10.2196/58649, author="Zhang, Ren and Liu, Yi and Zhang, Zhiwei and Luo, Rui and Lv, Bin", title="Interpretable Machine Learning Model for Predicting Postpartum Depression: Retrospective Study", journal="JMIR Med Inform", year="2025", month="Jan", day="20", volume="13", pages="e58649", keywords="postpartum depression", keywords="machine learning", keywords="predictive model", keywords="risk factors", keywords="XGBoost", keywords="extreme gradient boosting", keywords="PPD", abstract="Background: Postpartum depression (PPD) is a prevalent mental health issue with significant impacts on mothers and families. Exploring reliable predictors is crucial for the early and accurate prediction of PPD, which remains challenging. Objective: This study aimed to comprehensively collect variables from multiple aspects, develop and validate machine learning models to achieve precise prediction of PPD, and interpret the model to reveal clinical implications. Methods: This study recruited pregnant women who delivered at the West China Second University Hospital, Sichuan University. Various variables were collected from electronic medical record data and screened using least absolute shrinkage and selection operator penalty regression. Participants were divided into training (1358/2055, 66.1\%) and validation (697/2055, 33.9\%) sets by random sampling. Machine learning--based predictive models were developed in the training cohort. Models were validated in the validation cohort with receiver operating curve and decision curve analysis. 
Multiple model interpretation methods were implemented to explain the optimal model. Results: We recruited 2055 participants in this study. The extreme gradient boosting model was the optimal predictive model with the area under the receiver operating curve of 0.849. Shapley Additive Explanation indicated that the most influential predictors of PPD were antepartum depression, lower fetal weight, elevated thyroid-stimulating hormone, declined thyroid peroxidase antibodies, elevated serum ferritin, and older age. Conclusions: This study developed and validated a machine learning--based predictive model for PPD. Several significant risk factors and how they impact the prediction of PPD were revealed. These findings provide new insights into the early screening of individuals with high risk for PPD, emphasizing the need for comprehensive screening approaches that include both physiological and psychological factors. ", doi="10.2196/58649", url="https://medinform.jmir.org/2025/1/e58649" } @Article{info:doi/10.2196/54121, author="Zhang, Haofuzi and Zou, Peng and Luo, Peng and Jiang, Xiaofan", title="Machine Learning for the Early Prediction of Delayed Cerebral Ischemia in Patients With Subarachnoid Hemorrhage: Systematic Review and Meta-Analysis", journal="J Med Internet Res", year="2025", month="Jan", day="20", volume="27", pages="e54121", keywords="machine learning", keywords="subarachnoid hemorrhage", keywords="delayed cerebral ischemia", keywords="systematic review", abstract="Background: Delayed cerebral ischemia (DCI) is a primary contributor to death after subarachnoid hemorrhage (SAH), with significant incidence. Therefore, early determination of the risk of DCI is an urgent need. Machine learning (ML) has received much attention in clinical practice. Recently, some studies have attempted to apply ML models for early noninvasive prediction of DCI. However, systematic evidence for its predictive accuracy is still lacking. Objective: The aim of this study was to synthesize the prediction accuracy of ML models for DCI to provide evidence for the development or updating of intelligent detection tools. Methods: PubMed, Cochrane, Embase, and Web of Science databases were systematically searched up to May 18, 2023. The risk of bias in the included studies was assessed using PROBAST (Prediction Model Risk of Bias Assessment Tool). During the analysis, we discussed the performance of different models in the training and validation sets. Results: We finally included 48 studies containing 16,294 patients with SAH and 71 ML models with logistic regression as the main model type. In the training set, the pooled concordance index (C index), sensitivity, and specificity of all the models were 0.786 (95\% CI 0.737-0.835), 0.77 (95\% CI 0.69-0.84), and 0.83 (95\% CI 0.75-0.89), respectively, while those of the logistic regression models were 0.770 (95\% CI 0.724-0.817), 0.75 (95\% CI 0.67-0.82), and 0.71 (95\% CI 0.63-0.78), respectively. In the validation set, the pooled C index, sensitivity, and specificity of all the models were 0.767 (95\% CI 0.741-0.793), 0.66 (95\% CI 0.53-0.77), and 0.78 (95\% CI 0.71-0.84), respectively, while those of the logistic regression models were 0.757 (95\% CI 0.715-0.800), 0.59 (95\% CI 0.57-0.80), and 0.80 (95\% CI 0.71-0.87), respectively. Conclusions: ML models appear to have relatively desirable power for early noninvasive prediction of DCI after SAH. However, enhancing the prediction sensitivity of these models is challenging. 
Therefore, efficient, noninvasive, or minimally invasive low-cost predictors should be further explored in future studies to improve the prediction accuracy of ML models. Trial Registration: PROSPERO (CRD42023438399); https://tinyurl.com/yfuuudde ", doi="10.2196/54121", url="https://www.jmir.org/2025/1/e54121" } @Article{info:doi/10.2196/64284, author="Wei, Boxiong", title="Performance Evaluation and Implications of Large Language Models in Radiology Board Exams: Prospective Comparative Analysis", journal="JMIR Med Educ", year="2025", month="Jan", day="16", volume="11", pages="e64284", keywords="large language models", keywords="LLM", keywords="artificial intelligence", keywords="AI", keywords="GPT-4", keywords="radiology exams", keywords="medical education", keywords="diagnostics", keywords="medical training", keywords="radiology", keywords="ultrasound", abstract="Background: Artificial intelligence advancements have enabled large language models to significantly impact radiology education and diagnostic accuracy. Objective: This study evaluates the performance of mainstream large language models, including GPT-4, Claude, Bard, Tongyi Qianwen, and Gemini Pro, in radiology board exams. Methods: A comparative analysis of 150 multiple-choice questions from radiology board exams without images was conducted. Models were assessed on their accuracy for text-based questions and were categorized by cognitive levels and medical specialties using $\chi$2 tests and ANOVA. Results: GPT-4 achieved the highest accuracy (83.3\%, 125/150), significantly outperforming all other models. Specifically, Claude achieved an accuracy of 62\% (93/150; P<.001), Bard 54.7\% (82/150; P<.001), Tongyi Qianwen 70.7\% (106/150; P=.009), and Gemini Pro 55.3\% (83/150; P<.001). The odds ratios compared to GPT-4 were 0.33 (95\% CI 0.18-0.60) for Claude, 0.24 (95\% CI 0.13-0.44) for Bard, and 0.25 (95\% CI 0.14-0.45) for Gemini Pro. Tongyi Qianwen performed relatively well with an accuracy of 70.7\% (106/150; P=.02) and had an odds ratio of 0.48 (95\% CI 0.27-0.87) compared to GPT-4. Performance varied across question types and specialties, with GPT-4 excelling in both lower-order and higher-order questions, while Claude and Bard struggled with complex diagnostic questions. Conclusions: GPT-4 and Tongyi Qianwen show promise in medical education and training. The study emphasizes the need for domain-specific training datasets to enhance large language models' effectiveness in specialized fields like radiology. ", doi="10.2196/64284", url="https://mededu.jmir.org/2025/1/e64284" } @Article{info:doi/10.2196/57298, author="Kim, Taehwan and Choi, Jung-Yeon and Ko, Jin Myung and Kim, Kwang-il", title="Development and Validation of a Machine Learning Method Using Vocal Biomarkers for Identifying Frailty in Community-Dwelling Older Adults: Cross-Sectional Study", journal="JMIR Med Inform", year="2025", month="Jan", day="16", volume="13", pages="e57298", keywords="frailty", keywords="cross-sectional study", keywords="vocal biomarkers", keywords="older adults", keywords="artificial intelligence", keywords="machine learning", keywords="classification model", keywords="self-supervised", abstract="Background: The two most commonly used methods to identify frailty are the frailty phenotype and the frailty index. However, both methods have limitations in clinical application. In addition, methods for measuring frailty have not yet been standardized. 
Objective: We aimed to develop and validate a classification model for predicting frailty status using vocal biomarkers in community-dwelling older adults, based on voice recordings obtained from the picture description task (PDT). Methods: We recruited 127 participants aged 50 years and older and collected clinical information through a short form of the Comprehensive Geriatric Assessment scale. Voice recordings were collected with a tablet device during the Korean version of the PDT, and we preprocessed audio data to remove background noise before feature extraction. Three artificial intelligence (AI) models were developed for identifying frailty status: SpeechAI (using speech data only), DemoAI (using demographic data only), and DemoSpeechAI (combining both data types). Results: Our models were trained and evaluated on the basis of 5-fold cross-validation for 127 participants and compared. The SpeechAI model, using deep learning--based acoustic features, outperformed in terms of accuracy and area under the receiver operating characteristic curve (AUC), 80.4\% (95\% CI 76.89\%-83.91\%) and 0.89 (95\% CI 0.86-0.92), respectively, while the model using only demographics showed an accuracy of 67.96\% (95\% CI 67.63\%-68.29\%) and an AUC of 0.74 (95\% CI 0.73-0.75). The SpeechAI model outperformed the model using only demographics significantly in AUC ($t_4$=8.705 [2-sided]; P<.001). The DemoSpeechAI model, which combined demographics with deep learning--based acoustic features, showed superior performance (accuracy 85.6\%, 95\% CI 80.03\%-91.17\% and AUC 0.93, 95\% CI 0.89-0.97), but there was no significant difference in AUC between the SpeechAI and DemoSpeechAI models ($t_4$=1.057 [2-sided]; P=.35). Compared with models using traditional acoustic features from the openSMILE toolkit, the SpeechAI model demonstrated superior performance (AUC 0.89) over traditional methods (logistic regression: AUC 0.62; decision tree: AUC 0.57; random forest: AUC 0.66). Conclusions: Our findings demonstrate that vocal biomarkers derived from deep learning--based acoustic features can be effectively used to predict frailty status in community-dwelling older adults. The SpeechAI model showed promising accuracy and AUC, outperforming models based solely on demographic data or traditional acoustic features. Furthermore, while the combined DemoSpeechAI model showed slightly improved performance over the SpeechAI model, the difference was not statistically significant. These results suggest that speech-based AI models offer a noninvasive, scalable method for frailty detection, potentially streamlining assessments in clinical and community settings. 
", doi="10.2196/57298", url="https://medinform.jmir.org/2025/1/e57298" } @Article{info:doi/10.2196/51319, author="Kim, JaeYong and Vajravelu, Narayan Bathri", title="Assessing the Current Limitations of Large Language Models in Advancing Health Care Education", journal="JMIR Form Res", year="2025", month="Jan", day="16", volume="9", pages="e51319", keywords="large language model", keywords="generative pretrained transformer", keywords="health care education", keywords="health care delivery", keywords="artificial intelligence", keywords="LLM", keywords="ChatGPT", keywords="AI", doi="10.2196/51319", url="https://formative.jmir.org/2025/1/e51319" } @Article{info:doi/10.2196/55235, author="Gabrielli, Silvia and Mayora Ibarra, Oscar and Forti, Stefano", title="A Holistic Digital Health Framework to Support Health Prevention Strategies in the First 1000 Days", journal="JMIR Pediatr Parent", year="2025", month="Jan", day="16", volume="8", pages="e55235", keywords="digital health", keywords="digital therapeutics", keywords="behavioral intervention technology", keywords="prevention", keywords="citizen science", keywords="first 1000 days", doi="10.2196/55235", url="https://pediatrics.jmir.org/2025/1/e55235" } @Article{info:doi/10.2196/50852, author="Bienefeld, Nadine and Keller, Emanuela and Grote, Gudela", title="AI Interventions to Alleviate Healthcare Shortages and Enhance Work Conditions in Critical Care: Qualitative Analysis", journal="J Med Internet Res", year="2025", month="Jan", day="13", volume="27", pages="e50852", keywords="artificial intelligence", keywords="AI", keywords="work design", keywords="sociotechnical system", keywords="work", keywords="job", keywords="occupational health", keywords="sociotechnical", keywords="new work", keywords="future of work", keywords="satisfaction", keywords="health care professionals", keywords="intensive care", keywords="ICU", keywords="stress mitigation", keywords="worker", keywords="employee", keywords="stress", keywords="health care professional", keywords="overburdened", keywords="burden", keywords="burnout", keywords="autonomy", keywords="competence", keywords="flexible", keywords="task", keywords="workplace", keywords="hospital", abstract="Background: The escalating global scarcity of skilled health care professionals is a critical concern, further exacerbated by rising stress levels and clinician burnout rates. Artificial intelligence (AI) has surfaced as a potential resource to alleviate these challenges. Nevertheless, it is not taken for granted that AI will inevitably augment human performance, as ill-designed systems may inadvertently impose new burdens on health care workers, and implementation may be challenging. An in-depth understanding of how AI can effectively enhance rather than impair work conditions is therefore needed. Objective: This research investigates the efficacy of AI in alleviating stress and enriching work conditions, using intensive care units (ICUs) as a case study. Through a sociotechnical system lens, we delineate how AI systems, tasks, and responsibilities of ICU nurses and physicians can be co-designed to foster motivating, resilient, and health-promoting work. Methods: We use the sociotechnical system framework COMPASS (Complementary Analysis of Sociotechnical Systems) to assess 5 job characteristics: autonomy, skill diversity, flexibility, problem-solving opportunities, and task variety. 
The qualitative analysis is underpinned by extensive workplace observation in 6 ICUs (approximately 559 nurses and physicians), structured interviews with work unit leaders (n=12), and a comparative analysis of data science experts' and clinicians' evaluation of the optimal levels of human-AI teaming. Results: The results indicate that AI holds the potential to positively impact work conditions for ICU nurses and physicians in four key areas. First, autonomy is vital for stress reduction, motivation, and performance improvement. AI systems that ensure transparency, predictability, and human control can reinforce or amplify autonomy. Second, AI can encourage skill diversity and competence development, thus empowering clinicians to broaden their skills, increase the polyvalence of tasks across professional boundaries, and improve interprofessional cooperation. However, careful consideration is required to avoid the deskilling of experienced professionals. Third, AI automation can expand flexibility by relieving clinicians from administrative duties, thereby concentrating their efforts on patient care. Remote monitoring and improved scheduling can help integrate work with other life domains. Fourth, while AI may reduce problem-solving opportunities in certain areas, it can open new pathways, particularly for nurses. Finally, task identity and variety are essential job characteristics for intrinsic motivation and worker engagement but could be compromised depending on how AI tools are designed and implemented. Conclusions: This study demonstrates AI's capacity to mitigate stress and improve work conditions for ICU nurses and physicians, thereby contributing to resolving health care staffing shortages. AI solutions that are thoughtfully designed in line with the principles for good work design can enhance intrinsic motivation, learning, and worker well-being, thus providing strategic value for hospital management, policy makers, and health care professionals alike. ", doi="10.2196/50852", url="https://www.jmir.org/2025/1/e50852" } @Article{info:doi/10.2196/55673, author="Merkel, Sebastian and Schorr, Sabrina", title="Identification of Use Cases, Target Groups, and Motivations Around Adopting Smart Speakers for Health Care and Social Care Settings: Scoping Review", journal="JMIR AI", year="2025", month="Jan", day="13", volume="4", pages="e55673", keywords="conversational agents", keywords="smart speaker", keywords="health care", keywords="social care", keywords="digitalization", keywords="scoping review", keywords="mobile phone", abstract="Background: Conversational agents (CAs) are finding increasing application in health and social care, not least due to their growing use in the home. Recent developments in artificial intelligence, machine learning, and natural language processing have enabled a variety of new uses for CAs. One type of CA that has received increasing attention recently is smart speakers. Objective: The aim of our study was to identify the use cases, user groups, and settings of smart speakers in health and social care. We also wanted to identify the key motivations for developers and designers to use this particular type of technology. Methods: We conducted a scoping review to provide an overview of the literature on smart speakers in health and social care. The literature search was conducted between February 2023 and March 2023 and included 3 databases (PubMed, Scopus, and Sociological Abstracts), supplemented by Google Scholar. 
Several keywords were used, including technology (eg, voice assistant), product name (eg, Amazon Alexa), and setting (health care or social care). Publications were included if they met the predefined inclusion criteria: (1) published after 2015 and (2) used a smart speaker in a health care or social care setting. Publications were excluded if they met one of the following criteria: (1) did not report on the specific devices used, (2) did not focus specifically on smart speakers, (3) were systematic reviews and other forms of literature-based publications, and (4) were not published in English. Two reviewers collected, reviewed, abstracted, and analyzed the data using qualitative content analysis. Results: A total of 27 articles were included in the final review. These articles covered a wide range of use cases in different settings, such as private homes, hospitals, long-term care facilities, and outpatient services. The main target group was patients, especially older users, followed by doctors and other medical staff members. Conclusions: The results show that smart speakers have diverse applications in health and social care, addressing different contexts and audiences. Their affordability and easy-to-use interfaces make them attractive to various stakeholders. It seems likely that, due to technical advances in artificial intelligence and the market power of the companies behind the devices, there will be more use cases for smart speakers in the near future. ", doi="10.2196/55673", url="https://ai.jmir.org/2025/1/e55673", url="http://www.ncbi.nlm.nih.gov/pubmed/39804689" } @Article{info:doi/10.2196/62669, author="Rjoop, Anwar and Al-Qudah, Mohammad and Alkhasawneh, Raja and Bataineh, Nesreen and Abdaljaleel, Maram and Rjoub, A. Moayad and Alkhateeb, Mustafa and Abdelraheem, Mohammad and Al-Omari, Salem and Bani-Mari, Omar and Alkabalan, Anas and Altulaih, Saoud and Rjoub, Iyad and Alshimi, Rula", title="Awareness and Attitude Toward Artificial Intelligence Among Medical Students and Pathology Trainees: Survey Study", journal="JMIR Med Educ", year="2025", month="Jan", day="10", volume="11", pages="e62669", keywords="artificial intelligence", keywords="AI", keywords="deep learning", keywords="medical schools", keywords="pathology", keywords="Jordan", keywords="medical education", keywords="awareness", keywords="attitude", keywords="medical students", keywords="pathology trainees", keywords="national survey study", keywords="medical practice", keywords="training", keywords="web-based survey", keywords="survey", keywords="questionnaire", abstract="Background: Artificial intelligence (AI) is set to shape the future of medical practice. The perspective and understanding of medical students are critical for guiding the development of educational curricula and training. Objective: This study aims to assess and compare medical AI-related attitudes among medical students in general medicine and in one of the visually oriented fields (pathology), along with illuminating their anticipated role of AI in the rapidly evolving landscape of AI-enhanced health care. Methods: This was a cross-sectional study that used a web-based survey composed of a closed-ended questionnaire. The survey addressed medical students at all educational levels across the 5 public medical schools, along with pathology residents in 4 residency programs in Jordan. Results: A total of 394 respondents participated (328 medical students and 66 pathology residents). 
The majority of respondents (272/394, 69\%) were already aware of AI and deep learning in medicine, mainly relying on websites for information on AI, while only 14\% (56/394) were aware of AI through medical schools. There was a statistically significant difference in awareness among respondents who consider themselves tech experts compared with those who do not (P=.03). More than half of the respondents believed that AI could be used to diagnose diseases automatically (213/394, 54.1\% agreement), with medical students agreeing more than pathology residents (P=.04). However, more than one-third expressed fear about recent AI developments (167/394, 42.4\% agreed). Two-thirds of respondents disagreed that their medical schools had educated them about AI and its potential use (261/394, 66.2\% disagreed), while 46.2\% (182/394) expressed interest in learning about AI in medicine. In terms of pathology-specific questions, 75.4\% (297/394) agreed that AI could be used to identify pathologies in slide examinations automatically. There was a significant difference between medical students and pathology residents in their agreement (P=.001). Overall, medical students and pathology trainees had similar responses. Conclusions: AI education should be introduced into medical school curricula to improve medical students' understanding and attitudes. Students agreed that they need to learn about AI's applications, potential hazards, and legal and ethical implications. This is the first study to analyze medical students' views and awareness of AI in Jordan, as well as the first to include pathology residents' perspectives. The findings are consistent with earlier research internationally. In comparison with prior research, these attitudes are similar in low-income and industrialized countries, highlighting the need for a global strategy to introduce AI instruction to medical students everywhere in this era of rapidly expanding technology. 
", doi="10.2196/62669", url="https://mededu.jmir.org/2025/1/e62669" } @Article{info:doi/10.2196/67621, author="Chetla, Nitin and Tandon, Mihir and Chang, Joseph and Sukhija, Kunal and Patel, Romil and Sanchez, Ramon", title="Evaluating ChatGPT's Efficacy in Pediatric Pneumonia Detection From Chest X-Rays: Comparative Analysis of Specialized AI Models", journal="JMIR AI", year="2025", month="Jan", day="10", volume="4", pages="e67621", keywords="artificial intelligence", keywords="ChatGPT", keywords="pneumonia", keywords="chest x-ray", keywords="pediatric", keywords="radiology", keywords="large language models", keywords="machine learning", keywords="pneumonia detection", keywords="diagnosis", keywords="pediatric pneumonia", doi="10.2196/67621", url="https://ai.jmir.org/2025/1/e67621" } @Article{info:doi/10.2196/67256, author="Yang, Xiaomeng and Li, Zeyan and Lei, Lei and Shi, Xiaoyu and Zhang, Dingming and Zhou, Fei and Li, Wenjing and Xu, Tianyou and Liu, Xinyu and Wang, Songyun and Yuan, Quan and Yang, Jian and Wang, Xinyu and Zhong, Yanfei and Yu, Lilei", title="Noninvasive Oral Hyperspectral Imaging--Driven Digital Diagnosis of Heart Failure With Preserved Ejection Fraction: Model Development and Validation Study", journal="J Med Internet Res", year="2025", month="Jan", day="7", volume="27", pages="e67256", keywords="heart failure with preserved ejection fraction", keywords="HFpEF", keywords="hyperspectral imaging", keywords="HSI", keywords="diagnostic model", keywords="digital health", keywords="Shapley Additive Explanations", keywords="SHAP", keywords="machine learning", keywords="artificial intelligence", keywords="AI", keywords="cardiovascular disease", keywords="predictive modeling", keywords="oral health", abstract="Background: Oral microenvironmental disorders are associated with an increased risk of heart failure with preserved ejection fraction (HFpEF). Hyperspectral imaging (HSI) technology enables the detection of substances that are visually indistinguishable to the human eye, providing a noninvasive approach with extensive applications in medical diagnostics. Objective: The objective of this study is to develop and validate a digital, noninvasive oral diagnostic model for patients with HFpEF using HSI combined with various machine learning algorithms. Methods: Between April 2023 and August 2023, a total of 140 patients were recruited from Renmin Hospital of Wuhan University to serve as the training and internal testing groups for this study. Subsequently, from August 2024 to September 2024, an additional 35 patients were enrolled from Three Gorges University and Yichang Central People's Hospital to constitute the external testing group. After preprocessing to ensure image quality, spectral and textural features were extracted from the images. We extracted 25 spectral bands from each patient image and obtained 8 corresponding texture features to evaluate the performance of 28 machine learning algorithms for their ability to distinguish control participants from participants with HFpEF. The model demonstrating the optimal performance in both internal and external testing groups was selected to construct the HFpEF diagnostic model. Hyperspectral bands significant for identifying participants with HFpEF were identified for further interpretative analysis. The Shapley Additive Explanations (SHAP) model was used to provide analytical insights into feature importance. 
Results: Participants were divided into a training group (n=105), internal testing group (n=35), and external testing group (n=35), with consistent baseline characteristics across groups. Among the 28 algorithms tested, the random forest algorithm demonstrated superior performance with an area under the receiver operating characteristic curve (AUC) of 0.884 and an accuracy of 82.9\% in the internal testing group, as well as an AUC of 0.812 and an accuracy of 85.7\% in the external testing group. For model interpretation, we used the top 25 features identified by the random forest algorithm. The SHAP analysis revealed discernible distinctions between control participants and participants with HFpEF, thereby validating the diagnostic model's capacity to accurately identify participants with HFpEF. Conclusions: This noninvasive and efficient model facilitates the identification of individuals with HFpEF, thereby promoting early detection, diagnosis, and treatment. Our research presents a clinically advanced diagnostic framework for HFpEF, validated using independent data sets and demonstrating significant potential to enhance patient care. Trial Registration: China Clinical Trial Registry ChiCTR2300078855; https://www.chictr.org.cn/showproj.html?proj=207133 ", doi="10.2196/67256", url="https://www.jmir.org/2025/1/e67256" } @Article{info:doi/10.2196/64936, author="AlSerkal, Mohamed Yousif and Ibrahim, Mohamed Naseem and Alsereidi, Suhail Aisha and Ibrahim, Mubaraka and Kurakula, Sudheer and Naqvi, Ahsan Sadaf and Khan, Yasir and Oottumadathil, Preman Neema", title="Real-Time Analytics and AI for Managing No-Show Appointments in Primary Health Care in the United Arab Emirates: Before-and-After Study", journal="JMIR Form Res", year="2025", month="Jan", day="6", volume="9", pages="e64936", keywords="electronic health record", keywords="EHR", keywords="artificial intelligence", keywords="AI", keywords="no-show appointments", keywords="real-time data", keywords="primary health care", keywords="risk prediction", keywords="clinic waiting time", keywords="operational efficiency", abstract="Background: Primary health care (PHC) services face operational challenges due to high patient volumes, leading to complex management needs. Patients access services through booked appointments and walk-in visits, with walk-in visits often facing longer waiting times. No-show appointments are significant contributors to inefficiency in PHC operations, which can lead to an estimated 3\%-14\% revenue loss, disrupt resource allocation, and negatively impact health care quality. Emirates Health Services (EHS) PHC centers handle over 140,000 visits monthly. Baseline data indicate a 21\% no-show rate and an average patient wait time exceeding 16 minutes, necessitating an advanced scheduling and resource management system to enhance patient experiences and operational efficiency. Objective: The objective of this study was to evaluate the impact of an artificial intelligence (AI)-driven solution that was integrated with an interactive real-time data dashboard on reducing no-show appointments and improving patient waiting times at the EHS PHCs. Methods: This study introduced an innovative AI-based data application to enhance PHC efficiency. Leveraging our electronic health record system, we deployed an AI model with an 86\% accuracy rate to predict no-shows by analyzing historical data and categorizing appointments based on no-show risk. 
The model was integrated with a real-time dashboard to monitor patient journeys and wait times. Clinic coordinators used the dashboard to proactively manage high-risk appointments and optimize resource allocation. The intervention was assessed through a before-and-after comparison of PHC appointment dynamics and wait times, analyzing data from 135,393 appointments (67,429 before implementation and 67,964 after implementation). Results: Implementation of the AI-powered no-show prediction model resulted in a significant 50.7\% reduction in no-show rates (P<.001). The odds ratio for no-shows after implementation was 0.43 (95\% CI 0.42-0.45; P<.001), indicating a 57\% reduction in the likelihood of no-shows. Additionally, patient wait times decreased by an average of 5.7 minutes overall (P<.001), with some PHCs achieving up to a 50\% reduction in wait times. Conclusions: This project demonstrates that integrating AI with a data analytics platform and an electronic health record system can significantly improve operational efficiency and patient satisfaction in PHC settings. The AI model enabled daily assessments of wait times and allowed for real-time adjustments, such as reallocating patients to different clinicians, thus reducing wait times and optimizing resource use. These findings illustrate the transformative potential of AI and real-time data analytics in health care delivery. ", doi="10.2196/64936", url="https://formative.jmir.org/2025/1/e64936" } @Article{info:doi/10.2196/52270, author="Bae, Won Sang and Chung, Tammy and Zhang, Tongze and Dey, K. Anind and Islam, Rahul", title="Enhancing Interpretable, Transparent, and Unobtrusive Detection of Acute Marijuana Intoxication in Natural Environments: Harnessing Smart Devices and Explainable AI to Empower Just-In-Time Adaptive Interventions: Longitudinal Observational Study", journal="JMIR AI", year="2025", month="Jan", day="2", volume="4", pages="e52270", keywords="digital phenotyping", keywords="smart devices", keywords="intoxication", keywords="smartphone-based sensors", keywords="wearables", keywords="mHealth", keywords="marijuana", keywords="cannabis", keywords="data collection", keywords="passive sensing", keywords="Fitbit", keywords="machine learning", keywords="eXtreme Gradient Boosting Machine classifier", keywords="XGBoost", keywords="algorithmic decision-making process", keywords="explainable artificial intelligence", keywords="XAI", keywords="artificial intelligence", keywords="JITAI", keywords="decision support", keywords="just-in-time adaptive interventions", keywords="experience sampling", abstract="Background: Acute marijuana intoxication can impair motor skills and cognitive functions such as attention and information processing. However, traditional tests, like blood, urine, and saliva, fail to accurately detect acute marijuana intoxication in real time. Objective: This study aims to explore whether integrating smartphone-based sensors with readily accessible wearable activity trackers, like Fitbit, can enhance the detection of acute marijuana intoxication in naturalistic settings. No previous research has investigated the effectiveness of passive sensing technologies for enhancing algorithm accuracy or enhancing the interpretability of digital phenotyping through explainable artificial intelligence in real-life scenarios.
This approach aims to provide insights into how individuals interact with digital devices during algorithmic decision-making, particularly for detecting moderate to intensive marijuana intoxication in real-world contexts. Methods: Sensor data from smartphones and Fitbits, along with self-reported marijuana use, were collected from 33 young adults over a 30-day period using the experience sampling method. Participants rated their level of intoxication on a scale from 1 to 10 within 15 minutes of consuming marijuana and during 3 daily semirandom prompts. The ratings were categorized as not intoxicated (0), low (1-3), and moderate to intense intoxication (4-10). The study analyzed the performance of models using mobile phone data only, Fitbit data only, and a combination of both (MobiFit) in detecting acute marijuana intoxication. Results: The eXtreme Gradient Boosting Machine classifier showed that the MobiFit model, which combines mobile phone and wearable device data, achieved 99\% accuracy (area under the curve=0.99; F1-score=0.85) in detecting acute marijuana intoxication in natural environments. The F1-score indicated significant improvements in sensitivity and specificity for the combined MobiFit model compared to using mobile or Fitbit data alone. Explainable artificial intelligence revealed that moderate to intense self-reported marijuana intoxication was associated with specific smartphone and Fitbit metrics, including elevated minimum heart rate, reduced macromovement, and increased noise energy around participants. Conclusions: This study demonstrates the potential of using smartphone sensors and wearable devices for interpretable, transparent, and unobtrusive monitoring of acute marijuana intoxication in daily life. Advanced algorithmic decision-making provides valuable insight into behavioral, physiological, and environmental factors that could support timely interventions to reduce marijuana-related harm. Future real-world applications of these algorithms should be evaluated in collaboration with clinical experts to enhance their practicality and effectiveness. ", doi="10.2196/52270", url="https://ai.jmir.org/2025/1/e52270" } @Article{info:doi/10.2196/57271, author="Alqahtani, M. Mohammed and Alanazi, M. Abdullah M. and Algarni, S. Saleh and Aljohani, Hassan and Alenezi, K. Faraj and F Alotaibi, Tareq and Alotaibi, Mansour and K Alqahtani, Mobarak and Alahmari, Mushabbab and S Alwadeai, Khalid and M Alghamdi, Saeed and Almeshari, A. Mohammed and Alshammari, Faleh Turki and Mumenah, Noora and Al Harbi, Ebtihal and Al Nufaiei, F. Ziyad and Alhuthail, Eyas and Alzahrani, Esam and Alahmadi, Husam and Alarifi, Abdulaziz and Zaidan, Amal and T Ismaeil, Taha", title="Unveiling the Influence of AI on Advancements in Respiratory Care: Narrative Review", journal="Interact J Med Res", year="2024", month="Dec", day="20", volume="13", pages="e57271", keywords="artificial intelligence", keywords="AI", keywords="respiratory care", keywords="machine learning", keywords="digital health", keywords="narrative review", abstract="Background: Artificial intelligence is experiencing rapid growth, with continual innovation and advancements in the health care field. Objective: This study aims to evaluate the application of artificial intelligence technologies across various domains of respiratory care. Methods: We conducted a narrative review to examine the latest advancements in the use of artificial intelligence in the field of respiratory care. 
The search was independently conducted by respiratory care experts, each focusing on their respective scope of practice and area of interest. Results: This review illuminates the diverse applications of artificial intelligence, highlighting its use in areas associated with respiratory care. Artificial intelligence is harnessed across various areas in this field, including pulmonary diagnostics, respiratory care research, critical care or mechanical ventilation, pulmonary rehabilitation, telehealth, public health or health promotion, sleep clinics, home care, smoking or vaping behavior, and neonates and pediatrics. With its multifaceted utility, artificial intelligence can enhance the field of respiratory care, potentially leading to superior health outcomes for individuals under this extensive umbrella. Conclusions: As artificial intelligence advances, elevating academic standards in the respiratory care profession becomes imperative, allowing practitioners to contribute to research and understand artificial intelligence's impact on respiratory care. The permanent integration of artificial intelligence into respiratory care creates the need for respiratory therapists to positively influence its progression. By participating in artificial intelligence development, respiratory therapists can augment their clinical capabilities, knowledge, and patient outcomes. ", doi="10.2196/57271", url="https://www.i-jmr.org/2024/1/e57271", url="http://www.ncbi.nlm.nih.gov/pubmed/39705080" } @Article{info:doi/10.2196/56682, author="Wagh, Vaidehi and Scott, W. Matthew and Kraeutner, N. Sarah", title="Quantifying Similarities Between MediaPipe and a Known Standard to Address Issues in Tracking 2D Upper Limb Trajectories: Proof of Concept Study", journal="JMIR Form Res", year="2024", month="Dec", day="17", volume="8", pages="e56682", keywords="markerless pose estimation", keywords="procrustes analysis", keywords="artificial intelligence", keywords="motion", keywords="movement tracking", keywords="touchscreen", keywords="markerless tracking", keywords="upper limb", keywords="motor", abstract="Background: Markerless motion tracking methods have promise for use in a range of domains, including clinical settings where traditional marker-based systems for human pose estimation are not feasible. Artificial intelligence (AI)--based systems can offer a markerless, lightweight approach to motion capture. However, the accuracy of such systems, such as MediaPipe, for tracking fine upper limb movements involving the hand has not been explored. Objective: The aim of this study is to evaluate the 2D accuracy of MediaPipe against a known standard. Methods: Participants (N=10) performed a touchscreen-based shape-tracing task requiring them to trace the trajectory of a moving cursor using their index finger. Cursor trajectories created a reoccurring or random shape at 5 different speeds (500-2500 ms, in increments of 500 ms). Movement trajectories on each trial were simultaneously captured by the touchscreen and a separate video camera. Movement coordinates for each trial were extracted from the touchscreen and compared to those predicted by MediaPipe. Specifically, following resampling, normalization, and Procrustes transformations, root-mean-squared error (RMSE; primary outcome measure) was calculated between predicted coordinates and those generated by the touchscreen computer. 
Results: Although there was some size distortion in the frame-by-frame estimates predicted by MediaPipe, shapes were similar between the 2 methods and transformations improved the general overlap and similarity of the shapes. The resultant mean RMSE between predicted coordinates and those generated by the touchscreen was 0.28 (SD 0.06) normalized px. Equivalence testing revealed that accuracy differed between MediaPipe and the touchscreen, but that the true difference was between 0 and 0.30 normalized px (t114=−3.02; P=.002). Additional analyses revealed no differences in resultant RMSE between methods when comparing across lower frame rates (30 and 60 frames per second [FPS]), although there was greater RMSE for 120 FPS than for 60 FPS (t35.43=−2.51; P=.03). Conclusions: Overall, we quantified similarities between one AI-based approach to motion capture and a known standard for tracking fine upper limb movements, informing applications of such systems in domains such as clinical and research settings. Future work should address accuracy in 3 dimensions to further validate the use of AI-based systems, including MediaPipe, in such domains. ", doi="10.2196/56682", url="https://formative.jmir.org/2024/1/e56682" } @Article{info:doi/10.2196/64362, author="Dahu, M. Butros and Khan, Solaiman and Toubal, Eddine Imad and Alshehri, Mariam and Martinez-Villar, I. Carlos and Ogundele, B. Olabode and Sheets, R. Lincoln and Scott, J. Grant", title="Geospatial Modeling of Deep Neural Visual Features for Predicting Obesity Prevalence in Missouri: Quantitative Study", journal="JMIR AI", year="2024", month="Dec", day="17", volume="3", pages="e64362", keywords="geospatial modeling", keywords="deep convolutional neural network", keywords="DCNN", keywords="Residual Network-50", keywords="ResNet-50", keywords="satellite imagery", keywords="Moran I", keywords="local indicators of spatial association", keywords="LISA", keywords="spatial lag model", keywords="obesity rate", keywords="artificial intelligence", keywords="AI", abstract="Background: The global obesity epidemic demands innovative approaches to understand its complex environmental and social determinants. Spatial technologies, such as geographic information systems, remote sensing, and spatial machine learning, offer new insights into this health issue. This study uses deep learning and spatial modeling to predict obesity rates for census tracts in Missouri. Objective: This study aims to develop a scalable method for predicting obesity prevalence using deep convolutional neural networks applied to satellite imagery and geospatial analysis, focusing on 1052 census tracts in Missouri. Methods: Our analysis followed 3 steps. First, Sentinel-2 satellite images were processed using the Residual Network-50 model to extract environmental features from 63,592 image chips (224{\texttimes}224 pixels). Second, these features were merged with obesity rate data from the Centers for Disease Control and Prevention for Missouri census tracts. Third, a spatial lag model was used to predict obesity rates and analyze the association between deep neural visual features and obesity prevalence. Spatial autocorrelation was used to identify clusters of obesity rates. Results: Substantial spatial clustering of obesity rates was found across Missouri, with a Moran I value of 0.68, indicating similar obesity rates among neighboring census tracts.
The spatial lag model demonstrated strong predictive performance, with an R2 of 0.93 and a spatial pseudo R2 of 0.92, explaining 93\% of the variation in obesity rates. Local indicators from a spatial association analysis revealed regions with distinct high and low clusters of obesity, which were visualized through choropleth maps. Conclusions: This study highlights the effectiveness of integrating deep convolutional neural networks and spatial modeling to predict obesity prevalence based on environmental features from satellite imagery. The model's high accuracy and ability to capture spatial patterns offer valuable insights for public health interventions. Future work should expand the geographical scope and include socioeconomic data to further refine the model for broader applications in obesity research. ", doi="10.2196/64362", url="https://ai.jmir.org/2024/1/e64362", url="http://www.ncbi.nlm.nih.gov/pubmed/39688897" } @Article{info:doi/10.2196/56863, author="Wiwatthanasetthakarn, Phongphat and Ponthongmak, Wanchana and Looareesuwan, Panu and Tansawet, Amarit and Numthavaj, Pawin and McKay, J. Gareth and Attia, John and Thakkinstian, Ammarin", title="Development and Validation of a Literature Screening Tool: Few-Shot Learning Approach in Systematic Reviews", journal="J Med Internet Res", year="2024", month="Dec", day="11", volume="26", pages="e56863", keywords="few shots learning", keywords="deep learning", keywords="natural language processing", keywords="S-BERT", keywords="systematic review", keywords="study selection", keywords="sentence-bidirectional encoder representations from transformers", abstract="Background: Systematic reviews (SRs) are considered the highest level of evidence, but their rigorous literature screening process can be time-consuming and resource-intensive. This is particularly challenging given the rapid pace of medical advancements, which can quickly make SRs outdated. Few-shot learning (FSL), a machine learning approach that learns effectively from limited data, offers a potential solution to streamline this process. Sentence-bidirectional encoder representations from transformers (S-BERT) are particularly promising for identifying relevant studies with fewer examples. Objective: This study aimed to develop a model framework using FSL to efficiently screen and select relevant studies for inclusion in SRs, aiming to reduce workload while maintaining high recall rates. Methods: We developed and validated the FSL model framework using 9 previously published SR projects (2016-2018). The framework used S-BERT with titles and abstracts as input data. Key evaluation metrics, including workload reduction, cosine similarity score, and the number needed to screen at 100\% recall, were estimated to determine the optimal number of eligible studies for model training. A prospective evaluation phase involving 4 ongoing SRs was then conducted. Study selection by FSL and a secondary reviewer were compared with the principal reviewer (considered the gold standard) to estimate the false negative rate. Results: Model development suggested an optimal range of 4-12 eligible studies for FSL training. Using 4-6 eligible studies during model development resulted in similarity thresholds for 100\% recall, ranging from 0.432 to 0.636, corresponding to a workload reduction of 51.11\% (95\% CI 46.36-55.86) to 97.67\% (95\% CI 96.76-98.58). The prospective evaluation of 4 SRs aimed for a 50\% workload reduction, yielding numbers needed to screen 497 to 1035 out of 995 to 2070 studies. 
The false negative rate ranged from 1.87\% to 12.20\% for the FSL model and from 5\% to 56.48\% for the second reviewer compared with the principal reviewer. Conclusions: Our FSL framework demonstrates the potential for reducing workload in SR screening by over 50\%. However, the model did not achieve 100\% recall at this threshold, highlighting the potential for omitting eligible studies. Future work should focus on developing a web application to implement the FSL framework, making it accessible to researchers. ", doi="10.2196/56863", url="https://www.jmir.org/2024/1/e56863", url="http://www.ncbi.nlm.nih.gov/pubmed/39662894" } @Article{info:doi/10.2196/52597, author="Sorin, Vera and Brin, Dana and Barash, Yiftach and Konen, Eli and Charney, Alexander and Nadkarni, Girish and Klang, Eyal", title="Large Language Models and Empathy: Systematic Review", journal="J Med Internet Res", year="2024", month="Dec", day="11", volume="26", pages="e52597", keywords="empathy", keywords="LLMs", keywords="AI", keywords="ChatGPT", keywords="review methods", keywords="review methodology", keywords="systematic review", keywords="scoping", keywords="synthesis", keywords="foundation models", keywords="text-based", keywords="human interaction", keywords="emotional intelligence", keywords="objective metrics", keywords="human assessment", keywords="emotions", keywords="healthcare", keywords="cognitive", keywords="PRISMA", abstract="Background: Empathy, a fundamental aspect of human interaction, is characterized as the ability to experience another being's emotions within oneself. In health care, empathy is fundamental to the interaction between health care professionals and patients. It is a quality unique to humans that large language models (LLMs) are believed to lack. Objective: We aimed to review the literature on the capacity of LLMs in demonstrating empathy. Methods: We conducted a literature search on MEDLINE, Google Scholar, PsyArXiv, medRxiv, and arXiv between December 2022 and February 2024. We included English-language full-length publications that evaluated empathy in LLMs' outputs. We excluded papers evaluating other topics related to emotional intelligence that were not specifically empathy. The included studies' results, including the LLMs used, performance in empathy tasks, and limitations of the models, along with studies' metadata, were summarized. Results: A total of 12 studies published in 2023 met the inclusion criteria. ChatGPT-3.5 (OpenAI) was evaluated in all studies, with 6 studies comparing it with other LLMs such as GPT-4, LLaMA (Meta), and fine-tuned chatbots. Seven studies focused on empathy within a medical context. The studies reported LLMs to exhibit elements of empathy, including emotion recognition and emotional support in diverse contexts. Evaluation metrics included automatic metrics such as Recall-Oriented Understudy for Gisting Evaluation and Bilingual Evaluation Understudy, as well as human subjective evaluation. Some studies compared performance on empathy with humans, while others compared different models. In some cases, LLMs were observed to outperform humans in empathy-related tasks. For example, ChatGPT-3.5 was evaluated for its responses to patients' questions from social media, where ChatGPT's responses were preferred over those of humans in 78.6\% of cases. Other studies used subjective scores assigned by readers.
One study reported a mean empathy score of 1.84-1.9 (scale 0-2) for their fine-tuned LLM, while a different study evaluating ChatGPT-based chatbots reported a mean human rating of 3.43 out of 4 for empathetic responses. Other evaluations were based on the level of the emotional awareness scale, which was reported to be higher for ChatGPT-3.5 than for humans. Another study evaluated ChatGPT and GPT-4 on soft-skills questions in the United States Medical Licensing Examination, where GPT-4 answered 90\% of questions correctly. Limitations were noted, including repetitive use of empathic phrases, difficulty following initial instructions, overly lengthy responses, sensitivity to prompts, and overall subjective evaluation metrics influenced by the evaluator's background. Conclusions: LLMs exhibit elements of cognitive empathy, recognizing emotions and providing emotionally supportive responses in various contexts. Since social skills are an integral part of intelligence, these advancements bring LLMs closer to human-like interactions and expand their potential use in applications requiring emotional intelligence. However, there remains room for improvement in both the performance of these models and the evaluation strategies used for assessing soft skills. ", doi="10.2196/52597", url="https://www.jmir.org/2024/1/e52597" } @Article{info:doi/10.2196/63892, author="Cho, Seungbeom and Lee, Mangyeong and Yu, Jaewook and Yoon, Junghee and Choi, Jae-Boong and Jung, Kyu-Hwan and Cho, Juhee", title="Leveraging Large Language Models for Improved Understanding of Communications With Patients With Cancer in a Call Center Setting: Proof-of-Concept Study", journal="J Med Internet Res", year="2024", month="Dec", day="11", volume="26", pages="e63892", keywords="large language model", keywords="cancer", keywords="supportive care", keywords="LLMs", keywords="patient communication", keywords="natural language processing", keywords="NLP", keywords="self-management", keywords="teleconsultation", keywords="triage services", keywords="telephone consultations", abstract="Background: Hospital call centers play a critical role in providing support and information to patients with cancer, making it crucial to effectively identify and understand patient intent during consultations. However, operational efficiency and standardization of telephone consultations, particularly when categorizing diverse patient inquiries, remain significant challenges. While traditional deep learning models like long short-term memory (LSTM) and bidirectional encoder representations from transformers (BERT) have been used to address these issues, they heavily depend on annotated datasets, which are labor-intensive and time-consuming to generate. Large language models (LLMs) like GPT-4, with their in-context learning capabilities, offer a promising alternative for classifying patient intent without requiring extensive retraining. Objective: This study evaluates the performance of GPT-4 in classifying the purpose of telephone consultations of patients with cancer. In addition, it compares the performance of GPT-4 to that of discriminative models, such as LSTM and BERT, with a particular focus on their ability to manage ambiguous and complex queries. Methods: We used a dataset of 430,355 sentences from telephone consultations with patients with cancer between 2016 and 2020. LSTM and BERT models were trained on 300,000 sentences using supervised learning, while GPT-4 was applied using zero-shot and few-shot approaches without explicit retraining. 
The accuracy of each model was compared using 1,000 randomly selected sentences from 2020 onward, with special attention paid to how each model handled ambiguous or uncertain queries. Results: GPT-4, which uses only a few examples (a few shots), attained a remarkable accuracy of 85.2\%, considerably outperforming the LSTM and BERT models, which achieved accuracies of 73.7\% and 71.3\%, respectively. Notably, categories such as ``Treatment,'' ``Rescheduling,'' and ``Symptoms'' involve multiple contexts and exhibit significant complexity. GPT-4 demonstrated more than 15\% superior performance in handling ambiguous queries in these categories. In addition, GPT-4 excelled in categories like ``Records'' and ``Routine,'' where contextual clues were clear, outperforming the discriminative models. These findings emphasize the potential of LLMs, particularly GPT-4, for interpreting complicated patient interactions during cancer-related telephone consultations. Conclusions: This study shows the potential of GPT-4 to significantly improve the classification of patient intent in cancer-related telephone oncological consultations. GPT-4's ability to handle complex and ambiguous queries without extensive retraining provides a substantial advantage over discriminative models like LSTM and BERT. While GPT-4 demonstrates strong performance in various areas, further refinement of prompt design and category definitions is necessary to fully leverage its capabilities in practical health care applications. Future research will explore the integration of LLMs like GPT-4 into hybrid systems that combine human oversight with artificial intelligence--driven technologies. ", doi="10.2196/63892", url="https://www.jmir.org/2024/1/e63892" } @Article{info:doi/10.2196/55986, author="Zou, Zhuan and Chen, Bin and Xiao, Dongqiong and Tang, Fajuan and Li, Xihong", title="Accuracy of Machine Learning in Detecting Pediatric Epileptic Seizures: Systematic Review and Meta-Analysis", journal="J Med Internet Res", year="2024", month="Dec", day="11", volume="26", pages="e55986", keywords="epileptic seizures", keywords="machine learning", keywords="deep learning", keywords="electroencephalogram", keywords="EEG", keywords="children", keywords="pediatrics", keywords="epilepsy", keywords="detection", abstract="Background: Real-time monitoring of pediatric epileptic seizures poses a significant challenge in clinical practice. In recent years, machine learning (ML) has attracted substantial attention from researchers for diagnosing and treating neurological diseases, leading to its application for detecting pediatric epileptic seizures. However, systematic evidence substantiating its feasibility remains limited. Objective: This systematic review aimed to consolidate the existing evidence regarding the effectiveness of ML in monitoring pediatric epileptic seizures with an effort to provide an evidence-based foundation for the development and enhancement of intelligent tools in the future. Methods: We conducted a systematic search of the PubMed, Cochrane, Embase, and Web of Science databases for original studies focused on the detection of pediatric epileptic seizures using ML, with a cutoff date of August 27, 2023. The risk of bias in eligible studies was assessed using the QUADAS-2 (Quality Assessment of Diagnostic Accuracy Studies--2). Meta-analyses were performed to evaluate the C-index and the diagnostic 4-grid table, using a bivariate mixed-effects model for the latter. 
We also examined publication bias for the C-index by using funnel plots and the Egger test. Results: This systematic review included 28 original studies, with 15 studies on ML and 13 on deep learning (DL). All these models were based on electroencephalography data of children. The pooled C-index, sensitivity, specificity, and accuracy of ML in the training set were 0.76 (95\% CI 0.69-0.82), 0.77 (95\% CI 0.73-0.80), 0.74 (95\% CI 0.70-0.77), and 0.75 (95\% CI 0.72-0.77), respectively. In the validation set, the pooled C-index, sensitivity, specificity, and accuracy of ML were 0.73 (95\% CI 0.67-0.79), 0.88 (95\% CI 0.83-0.91), 0.83 (95\% CI 0.71-0.90), and 0.78 (95\% CI 0.73-0.82), respectively. Meanwhile, the pooled C-index of DL in the validation set was 0.91 (95\% CI 0.88-0.94), with sensitivity, specificity, and accuracy being 0.89 (95\% CI 0.85-0.91), 0.91 (95\% CI 0.88-0.93), and 0.89 (95\% CI 0.86-0.92), respectively. Conclusions: Our systematic review demonstrates promising accuracy of artificial intelligence methods in epilepsy detection. DL appears to offer higher detection accuracy than ML. These findings support the development of DL-based early-warning tools in future research. Trial Registration: PROSPERO CRD42023467260; https://www.crd.york.ac.uk/prospero/display\_record.php?ID=CRD42023467260 ", doi="10.2196/55986", url="https://www.jmir.org/2024/1/e55986" } @Article{info:doi/10.2196/60063, author="Chen, Xiaolan and Zhao, Ziwei and Zhang, Weiyi and Xu, Pusheng and Wu, Yue and Xu, Mingpu and Gao, Le and Li, Yinwen and Shang, Xianwen and Shi, Danli and He, Mingguang", title="EyeGPT for Patient Inquiries and Medical Education: Development and Validation of an Ophthalmology Large Language Model", journal="J Med Internet Res", year="2024", month="Dec", day="11", volume="26", pages="e60063", keywords="large language model", keywords="generative pretrained transformer", keywords="generative artificial intelligence", keywords="ophthalmology", keywords="retrieval-augmented generation", keywords="medical assistant", keywords="EyeGPT", keywords="generative AI", abstract="Background: Large language models (LLMs) have the potential to enhance clinical flow and improve medical education, but they encounter challenges related to specialized knowledge in ophthalmology. Objective: This study aims to enhance ophthalmic knowledge by refining a general LLM into an ophthalmology-specialized assistant for patient inquiries and medical education. Methods: We transformed Llama2 into an ophthalmology-specialized LLM, termed EyeGPT, through the following 3 strategies: prompt engineering for role-playing, fine-tuning with publicly available data sets filtered for eye-specific terminology (83,919 samples), and retrieval-augmented generation leveraging a medical database and 14 ophthalmology textbooks. The efficacy of various EyeGPT variants was evaluated by 4 board-certified ophthalmologists through comprehensive use of 120 diverse category questions in both simple and complex question-answering scenarios. The performance of the best EyeGPT model was then compared with that of the unassisted human physician group and the EyeGPT+human group. We proposed 4 metrics for assessment: accuracy, understandability, trustworthiness, and empathy. The proportion of hallucinations was also reported. 
Results: The best fine-tuned model significantly outperformed the original Llama2 model at providing informed advice (mean 9.30, SD 4.42 vs mean 13.79, SD 5.70; P<.001) and mitigating hallucinations (97/120, 80.8\% vs 53/120, 44.2\%, P<.001). Incorporating information retrieval from reliable sources, particularly ophthalmology textbooks, further improved the model's response compared with solely the best fine-tuned model (mean 13.08, SD 5.43 vs mean 15.14, SD 4.64; P=.001) and reduced hallucinations (71/120, 59.2\% vs 57/120, 47.4\%, P=.02). Subgroup analysis revealed that EyeGPT showed robustness across common diseases, with consistent performance across different users and domains. Among the variants, the model integrating fine-tuning and book retrieval ranked highest, closely followed by the combination of fine-tuning and the manual database, standalone fine-tuning, and pure role-playing methods. EyeGPT demonstrated competitive capabilities in understandability and empathy when compared with human ophthalmologists. With the assistance of EyeGPT, the performance of the ophthalmologist was notably enhanced. Conclusions: We pioneered and introduced EyeGPT by refining a general domain LLM and conducted a comprehensive comparison and evaluation of different strategies to develop an ophthalmology-specific assistant. Our results highlight EyeGPT's potential to assist ophthalmologists and patients in medical settings. ", doi="10.2196/60063", url="https://www.jmir.org/2024/1/e60063", url="http://www.ncbi.nlm.nih.gov/pubmed/39661433" } @Article{info:doi/10.2196/60650, author="Bosco, Cristina and Shojaei, Fereshtehossadat and Theisz, Andrew Alec and Osorio Torres, John and Cureton, Bianca and Himes, K. Anna and Jessup, M. Nenette and Barnes, A. Priscilla and Lu, Yvonne and Hendrie, C. Hugh and Hill, V. Carl and Shih, C. Patrick", title="Testing 3 Modalities (Voice Assistant, Chatbot, and Mobile App) to Assist Older African American and Black Adults in Seeking Information on Alzheimer Disease and Related Dementias: Wizard of Oz Usability Study", journal="JMIR Form Res", year="2024", month="Dec", day="9", volume="8", pages="e60650", keywords="older African American and Black adults", keywords="Alzheimer disease and related dementias", keywords="health literacy", keywords="Wizard of Oz", keywords="voice assistant", keywords="chatbot", keywords="mobile app", keywords="dementia", keywords="geriatric", keywords="aging", keywords="Alzheimer disease", keywords="artificial intelligence", keywords="AI", keywords="mHealth", keywords="digital tools", abstract="Background: Older African American and Black adults are twice as likely to develop Alzheimer disease and related dementias (ADRD) and have the lowest level of ADRD health literacy compared to any other ethnic group in the United States. Low health literacy concerning ADRD negatively impacts African American and Black people in accessing adequate health care. Objective: This study explored how 3 technological modalities---voice assistants, chatbots, and mobile apps---can assist older African American and Black adults in accessing ADRD information to improve ADRD health literacy. By testing each modality independently, the focus could be kept on understanding the unique needs and challenges of this population concerning the use of each modality when accessing ADRD-related information. Methods: Using the Wizard of Oz usability testing method, we assessed the 3 modalities with a sample of 15 older African American and Black adults aged >55 years. 
The 15 participants were asked to interact with the 3 modalities to search for information on local events happening in their geographical area and search for ADRD-related health information. Results: Our findings revealed that, across the 3 modalities, the content should avoid convoluted and complex language and give the possibility to save, store, and share it to be fully accessible by this population. In addition, content should come from credible sources, including information tailored to the participants' cultural values, as it has to be culturally relevant for African American and Black communities. Finally, the interaction with the tool must be time efficient, and it should be adapted to the user's needs to foster a sense of control and representation. Conclusions: We conclude that, when designing ADRD-related interventions for African American and Black older adults, it proves to be crucial to tailor the content provided by the technology to the community's values and construct an interaction with the technology that is built on African American and Black communities' needs and demands. ", doi="10.2196/60650", url="https://formative.jmir.org/2024/1/e60650" } @Article{info:doi/10.2196/55833, author="Gupta, Vikash and Erdal, Barbaros and Ramirez, Carolina and Floca, Ralf and Genereaux, Bradley and Bryson, Sidney and Bridge, Christopher and Kleesiek, Jens and Nensa, Felix and Braren, Rickmer and Younis, Khaled and Penzkofer, Tobias and Bucher, Michael Andreas and Qin, Melvin Ming and Bae, Gigon and Lee, Hyeonhoon and Cardoso, Jorge M. and Ourselin, Sebastien and Kerfoot, Eric and Choudhury, Rahul and White, D. Richard and Cook, Tessa and Bericat, David and Lungren, Matthew and Haukioja, Risto and Shuaib, Haris", title="Current State of Community-Driven Radiological AI Deployment in Medical Imaging", journal="JMIR AI", year="2024", month="Dec", day="9", volume="3", pages="e55833", keywords="radiology", keywords="open-source", keywords="radiology in practice", keywords="deep learning", keywords="artificial intelligence", keywords="imaging informatics", keywords="clinical deployment", keywords="imaging", keywords="medical informatics", keywords="workflow", keywords="operation", keywords="implementation", keywords="adoption", keywords="taxonomy", keywords="use case", keywords="model", keywords="integration", keywords="machine learning", keywords="mobile phone", doi="10.2196/55833", url="https://ai.jmir.org/2024/1/e55833" } @Article{info:doi/10.2196/60851, author="Varghese, Julian and Schuster, Alexander and Poschkamp, Broder and Yildirim, Kemal and Oehm, Johannes and Berens, Philipp and M{\"u}ller, Sarah and Gervelmeyer, Julius and Koch, Lisa and Hoffmann, Katja and Sedlmayr, Martin and Kakkassery, Vinodh and Kohlbacher, Oliver and Merle, David and Bartz-Schmidt, Ulrich Karl and Ueffing, Marius and Stahl, Dana and Leddig, Torsten and Bialke, Martin and Hampf, Christopher and Hoffmann, Wolfgang and Berthe, Sebastian and Waltemath, Dagmar and Walter, Peter and Lipprandt, Myriam and R{\"o}hrig, Rainer and Storp, Julian Jens and Zimmermann, Alexander Julian and Holtrup, Lea and Brix, Tobias and Stahl, Andreas and Eter, Nicole", title="EyeMatics: An Ophthalmology Use Case Within the German Medical Informatics Initiative", journal="JMIR Med Inform", year="2024", month="Dec", day="5", volume="12", pages="e60851", keywords="digital ophthalmology", keywords="interoperability", keywords="precision ophthalmology", keywords="patient engagement", keywords="Germany", keywords="clinical use", 
keywords="intravitreal", keywords="injections", keywords="eye", keywords="treatment", keywords="patient data", keywords="framework", keywords="AI", keywords="artificial intelligence", keywords="biomarker", keywords="retinal", keywords="scan", keywords="user-centered", keywords="observational", doi="10.2196/60851", url="https://medinform.jmir.org/2024/1/e60851" } @Article{info:doi/10.2196/63038, author="Shojaei, Fereshtehossadat and Shojaei, Fatemehalsadat and Osorio Torres, John and Shih, C. Patrick", title="Insights From Art Therapists on Using AI-Generated Art in Art Therapy: Mixed Methods Study", journal="JMIR Form Res", year="2024", month="Dec", day="4", volume="8", pages="e63038", keywords="art therapy", keywords="artificial intelligence", keywords="AI", keywords="therapeutic interventions", keywords="assistive AI", keywords="engagement", keywords="health care", keywords="therapy", keywords="art", keywords="therapists' insights", keywords="daily life", keywords="practitioner", keywords="assistive", keywords="AI-generated image", keywords="accessibility", keywords="therapy sessions", keywords="AI-generated tool", abstract="Background: With the increasing integration of artificial intelligence (AI) into various aspects of daily life, there is a growing interest among designers and practitioners in incorporating AI into their fields. In health care domains like art therapy, AI is also becoming a subject of exploration. However, the use of AI in art therapy is still undergoing investigation, with its benefits and challenges being actively explored. Objective: This study aims to investigate the integration of AI into art therapy practices to comprehend its potential impact on therapeutic processes and outcomes. Specifically, the focus is on understanding the perspectives of art therapists regarding the use of AI-assisted tools in their practice with clients, as demonstrated through the presentation of our prototype consisting of a deck of cards with words covering various categories alongside an AI-generated image. Methods: Using a co-design approach, 10 art therapists affiliated with the American Art Therapy Association participated in this study. They engaged in individual interviews where they discussed their professional perspectives on integrating AI into their therapeutic approaches and evaluating the prototype. Qualitative analysis was conducted to derive themes and insights from these sessions. Results: The study began in August 2023, with data collection involving 10 participants taking place in October 2023. Our qualitative findings provide a comprehensive evaluation of the impact of AI on facilitating therapeutic processes. The combination of a deck of cards and the use of an AI-generated tool demonstrated an enhancement in the quality and accessibility of therapy sessions. However, challenges such as credibility and privacy concerns were also identified. Conclusions: The integration of AI into art therapy presents promising avenues for innovation and progress within the field. By gaining insights into the perspectives and experiences of art therapists, this study contributes knowledge for both practical application and further research. 
", doi="10.2196/63038", url="https://formative.jmir.org/2024/1/e63038" } @Article{info:doi/10.2196/57451, author="Jin, Kyung Hye and Kim, EunYoung", title="Performance of GPT-3.5 and GPT-4 on the Korean Pharmacist Licensing Examination: Comparison Study", journal="JMIR Med Educ", year="2024", month="Dec", day="4", volume="10", pages="e57451", keywords="GPT-3.5", keywords="GPT-4", keywords="Korean", keywords="Korean Pharmacist Licensing Examination", keywords="KPLE", abstract="Background: ChatGPT, a recently developed artificial intelligence chatbot and a notable large language model, has demonstrated improved performance on medical field examinations. However, there is currently little research on its efficacy in languages other than English or in pharmacy-related examinations. Objective: This study aimed to evaluate the performance of GPT models on the Korean Pharmacist Licensing Examination (KPLE). Methods: We evaluated the percentage of correct answers provided by 2 different versions of ChatGPT (GPT-3.5 and GPT-4) for all multiple-choice single-answer KPLE questions, excluding image-based questions. In total, 320, 317, and 323 questions from the 2021, 2022, and 2023 KPLEs, respectively, were included in the final analysis, which consisted of 4 units: Biopharmacy, Industrial Pharmacy, Clinical and Practical Pharmacy, and Medical Health Legislation. Results: The 3-year average percentage of correct answers was 86.5\% (830/960) for GPT-4 and 60.7\% (583/960) for GPT-3.5. GPT model accuracy was highest in Biopharmacy (GPT-3.5 77/96, 80.2\% in 2022; GPT-4 87/90, 96.7\% in 2021) and lowest in Medical Health Legislation (GPT-3.5 8/20, 40\% in 2022; GPT-4 12/20, 60\% in 2022). Additionally, when comparing the performance of artificial intelligence with that of human participants, pharmacy students outperformed GPT-3.5 but not GPT-4. Conclusions: In the last 3 years, GPT models have performed very close to or exceeded the passing threshold for the KPLE. This study demonstrates the potential of large language models in the pharmacy domain; however, extensive research is needed to evaluate their reliability and ensure their secure application in pharmacy contexts due to several inherent challenges. Addressing these limitations could make GPT models more effective auxiliary tools for pharmacy education. ", doi="10.2196/57451", url="https://mededu.jmir.org/2024/1/e57451" } @Article{info:doi/10.2196/63262, author="Brandl, Lena and Jansen-Kosterink, Stephanie and Brodbeck, Jeannette and Jacinto, Sofia and Mooser, Bettina and Heylen, Dirk", title="Moving Toward Meaningful Evaluations of Monitoring in e-Mental Health Based on the Case of a Web-Based Grief Service for Older Mourners: Mixed Methods Study", journal="JMIR Form Res", year="2024", month="Nov", day="28", volume="8", pages="e63262", keywords="e-mental health", keywords="digital mental health service", keywords="mental health", keywords="digital health", keywords="internet intervention", keywords="monitoring mental health", keywords="monitor", keywords="e-coach", keywords="coaching", keywords="grieve", keywords="mourn", keywords="old", keywords="affective states", keywords="artificial intelligence", keywords="predictive", keywords="repeatedly measured predictors in regression", keywords="fuzzy cognitive map", keywords="algorithm", keywords="AI", abstract="Background: Artificial intelligence (AI) tools hold much promise for mental health care by increasing the scalability and accessibility of care. 
However, current development and evaluation practices of AI tools limit their meaningfulness for health care contexts and therefore also the practical usefulness of such tools for professionals and clients alike. Objective: The aim of this study is to demonstrate the evaluation of an AI monitoring tool that detects the need for more intensive care in a web-based grief intervention for older mourners who have lost their spouse, with the goal of moving toward meaningful evaluation of AI tools in e-mental health. Method: We leveraged the insights from three evaluation approaches: (1) the F1-score evaluated the tool's capacity to classify user monitoring parameters as either in need of more intensive support or recommendable to continue using the web-based grief intervention as is; (2) we used linear regression to assess the predictive value of users' monitoring parameters for clinical changes in grief, depression, and loneliness over the course of a 10-week intervention; and (3) we collected qualitative experience data from e-coaches (N=4) who incorporated the monitoring in their weekly email guidance during the 10-week intervention. Results: Based on n=174 binary recommendation decisions, the F1-score of the monitoring tool was 0.91. Due to minimal change in depression and loneliness scores after the 10-week intervention, only 1 linear regression was conducted. The difference score in grief before and after the intervention was included as a dependent variable. Participants' (N=21) mean score on the self-report monitoring and the estimated slope of individually fitted growth curves and its standard error (ie, participants' response pattern to the monitoring questions) were used as predictors. Only the mean monitoring score exhibited predictive value for the observed change in grief (R2=1.19, SE 0.33; t16=3.58, P=.002). The e-coaches appreciated the monitoring tool as an opportunity to confirm their initial impression about intervention participants, personalize their email guidance, and detect when participants' mental health deteriorated during the intervention. Conclusions: The monitoring tool evaluated in this paper identified a need for more intensive support reasonably well in a nonclinical sample of older mourners, had some predictive value for the change in grief symptoms during a 10-week intervention, and was appreciated as an additional source of mental health information by e-coaches who supported mourners during the intervention. Each evaluation approach in this paper came with its own set of limitations, including (1) skewed class distributions in prediction tasks based on real-life health data and (2) choosing meaningful statistical analyses based on clinical trial designs that are not targeted at evaluating AI tools. However, combining multiple evaluation methods facilitates drawing meaningful conclusions about the clinical value of AI monitoring tools for their intended mental health context. 
", doi="10.2196/63262", url="https://formative.jmir.org/2024/1/e63262" } @Article{info:doi/10.2196/64380, author="Varghese, Anna Mahima and Sharma, Poonam and Patwardhan, Maitreyee", title="Public Perception on Artificial Intelligence--Driven Mental Health Interventions: Survey Research", journal="JMIR Form Res", year="2024", month="Nov", day="28", volume="8", pages="e64380", keywords="public perception", keywords="artificial intelligence", keywords="AI", keywords="AI-driven", keywords="human-driven", keywords="mental health inteventions", keywords="mental health stigma", keywords="trust in AI", keywords="digital health", keywords="India", keywords="mobile phone", abstract="Background: Artificial intelligence (AI) has become increasingly important in health care, generating both curiosity and concern. With a doctor-patient ratio of 1:834 in India, AI has the potential to alleviate a significant health care burden. Public perception plays a crucial role in shaping attitudes that can facilitate the adoption of new technologies. Similarly, the acceptance of AI-driven mental health interventions is crucial in determining their effectiveness and widespread adoption. Therefore, it is essential to study public perceptions and usage of existing AI-driven mental health interventions by exploring user experiences and opinions on their future applicability, particularly in comparison to traditional, human-based interventions. Objective: This study aims to explore the use, perception, and acceptance of AI-driven mental health interventions in comparison to traditional, human-based interventions. Methods: A total of 466 adult participants from India voluntarily completed a 30-item web-based survey on the use and perception of AI-based mental health interventions between November and December 2023. Results: Of the 466 respondents, only 163 (35\%) had ever consulted a mental health professional. Additionally, 305 (65.5\%) reported very low knowledge of AI-driven interventions. In terms of trust, 247 (53\%) expressed a moderate level of Trust in AI-Driven Mental Health Interventions, while only 24 (5.2\%) reported a high level of trust. By contrast, 114 (24.5\%) reported high trust and 309 (66.3\%) reported moderate Trust in Human-Based Mental Health Interventions; 242 (51.9\%) participants reported a high level of stigma associated with using human-based interventions, compared with only 50 (10.7\%) who expressed concerns about stigma related to AI-driven interventions. Additionally, 162 (34.8\%) expressed a positive outlook toward the future use and social acceptance of AI-based interventions. The majority of respondents indicated that AI could be a useful option for providing general mental health tips and conducting initial assessments. The key benefits of AI highlighted by participants were accessibility, cost-effectiveness, 24/7 availability, and reduced stigma. Major concerns included data privacy, security, the lack of human touch, and the potential for misdiagnosis. Conclusions: There is a general lack of awareness about AI-driven mental health interventions. However, AI shows potential as a viable option for prevention, primary assessment, and ongoing mental health maintenance. Currently, people tend to trust traditional mental health practices more. Stigma remains a significant barrier to accessing traditional mental health services. Currently, the human touch remains an indispensable aspect of human-based mental health care, one that AI cannot replace. 
However, integrating AI with human mental health professionals is seen as a compelling model. AI is positively perceived in terms of accessibility, availability, and destigmatization. Knowledge and perceived trustworthiness are key factors influencing the acceptance and effectiveness of AI-driven mental health interventions. ", doi="10.2196/64380", url="https://formative.jmir.org/2024/1/e64380" } @Article{info:doi/10.2196/54557, author="Zheng, Jiakun and Wang, Junjie and Shen, Jing and An, Ruopeng", title="Artificial Intelligence Applications to Measure Food and Nutrient Intakes: Scoping Review", journal="J Med Internet Res", year="2024", month="Nov", day="28", volume="26", pages="e54557", keywords="food", keywords="nutrient", keywords="diet", keywords="artificial intelligence", keywords="machine learning", keywords="deep learning", keywords="neural networks", keywords="computer vision", keywords="natural language processing", keywords="measurement", keywords="AI", keywords="food intake", keywords="systematic literature", keywords="dietary assessments", keywords="AI-based", keywords="disease management", keywords="mobile phone", abstract="Background: Accurate measurement of food and nutrient intake is crucial for nutrition research, dietary surveillance, and disease management, but traditional methods such as 24-hour dietary recalls, food diaries, and food frequency questionnaires are often prone to recall error and social desirability bias, limiting their reliability. With the advancement of artificial intelligence (AI), there is potential to overcome these limitations through automated, objective, and scalable dietary assessment techniques. However, the effectiveness and challenges of AI applications in this domain remain inadequately explored. Objective: This study aimed to conduct a scoping review to synthesize existing literature on the efficacy, accuracy, and challenges of using AI tools in assessing food and nutrient intakes, offering insights into their current advantages and areas of improvement. Methods: This review followed the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews) guidelines. A comprehensive literature search was conducted in 4 databases---PubMed, Web of Science, Cochrane Library, and EBSCO---covering publications from the databases' inception to June 30, 2023. Studies were included if they used modern AI approaches to assess food and nutrient intakes in human subjects. Results: The 25 included studies, published between 2010 and 2023, involved sample sizes ranging from 10 to 38,415 participants. These studies used a variety of input data types, including food images (n=10), sound and jaw motion data from wearable devices (n=9), and text data (n=4), with 2 studies combining multiple input types. AI models applied included deep learning (eg, convolutional neural networks), machine learning (eg, support vector machines), and hybrid approaches. Applications were categorized into dietary intake assessment, food detection, nutrient estimation, and food intake prediction. Food detection accuracies ranged from 74\% to 99.85\%, and nutrient estimation errors varied between 10\% and 15\%. For instance, the RGB-D (Red, Green, Blue-Depth) fusion network achieved a mean absolute error of 15\% in calorie estimation, and a sound-based classification model reached up to 94\% accuracy in detecting food intake based on jaw motion and chewing patterns. 
In addition, AI-based systems provided real-time monitoring capabilities, improving the precision of dietary assessments and demonstrating the potential to reduce recall bias typically associated with traditional self-report methods. Conclusions: While AI demonstrated significant advantages in improving accuracy, reducing labor, and enabling real-time monitoring, challenges remain in adapting to diverse food types, ensuring algorithmic fairness, and addressing data privacy concerns. The findings suggest that AI has transformative potential for dietary assessment at both individual and population levels, supporting precision nutrition and chronic disease management. Future research should focus on enhancing the robustness of AI models across diverse dietary contexts and integrating biological sensors for a holistic dietary assessment approach. ", doi="10.2196/54557", url="https://www.jmir.org/2024/1/e54557" } @Article{info:doi/10.2196/62747, author="Campbell, Marie Amy and Hauton, Chris and van Aerle, Ronny and Martinez-Urtaza, Jaime", title="Eco-Evolutionary Drivers of Vibrio parahaemolyticus Sequence Type 3 Expansion: Retrospective Machine Learning Approach", journal="JMIR Bioinform Biotech", year="2024", month="Nov", day="28", volume="5", pages="e62747", keywords="pathogen expansion", keywords="climate change", keywords="machine learning", keywords="ecology", keywords="evolution", keywords="vibrio parahaemolyticus", keywords="sequencing", keywords="sequence type 3", keywords="VpST3", keywords="genomics", abstract="Background: Environmentally sensitive pathogens exhibit ecological and evolutionary responses to climate change that result in the emergence and global expansion of well-adapted variants. It is imperative to understand the mechanisms that facilitate pathogen emergence and expansion, as well as the drivers behind the mechanisms, to understand and prepare for future pandemic expansions. Objective: The unique, rapid, global expansion of a clonal complex of Vibrio parahaemolyticus (a marine bacterium causing gastroenteritis infections) named Vibrio parahaemolyticus sequence type 3 (VpST3) provides an opportunity to explore the eco-evolutionary drivers of pathogen expansion. Methods: The global expansion of VpST3 was reconstructed using VpST3 genomes, which were then classified into metrics characterizing the stages of this expansion process, indicative of the stages of emergence and establishment. We used machine learning, specifically a random forest classifier, to test a range of ecological and evolutionary drivers for their potential in predicting VpST3 expansion dynamics. Results: We identified a range of evolutionary features, including mutations in the core genome and accessory gene presence, associated with expansion dynamics. A range of random forest classifier approaches were tested to predict expansion classification metrics for each genome. The highest predictive accuracies (ranging from 0.722 to 0.967) were achieved for models using a combined eco-evolutionary approach. While population structure and the difference between introduced and established isolates could be predicted to a high accuracy, our model reported multiple false positives when predicting the success of an introduced isolate, suggesting potential limiting factors not represented in our eco-evolutionary features. Regional models produced for 2 countries reporting the most VpST3 genomes had varying success, reflecting the impacts of class imbalance. 
Conclusions: These novel insights into evolutionary features and ecological conditions related to the stages of VpST3 expansion showcase the potential of machine learning models using genomic data and will contribute to the future understanding of the eco-evolutionary pathways of climate-sensitive pathogens. ", doi="10.2196/62747", url="https://bioinform.jmir.org/2024/1/e62747", url="http://www.ncbi.nlm.nih.gov/pubmed/39607996" } @Article{info:doi/10.2196/54641, author="Song, Kyungchul and Ko, Taehoon and Chae, Wook Hyun and Oh, Suk Jun and Kim, Ho-Seong and Shin, Joo Hyun and Kim, Jeong-Ho and Na, Ji-Hoon and Park, Jung Chae and Sohn, Beomseok", title="Development and Validation of a Prediction Model Using Sella Magnetic Resonance Imaging--Based Radiomics and Clinical Parameters for the Diagnosis of Growth Hormone Deficiency and Idiopathic Short Stature: Cross-Sectional, Multicenter Study", journal="J Med Internet Res", year="2024", month="Nov", day="27", volume="26", pages="e54641", keywords="dwarfism", keywords="pituitary", keywords="idiopathic short stature", keywords="child", keywords="adolescent", keywords="machine learning", keywords="magnetic resonance imaging", keywords="MRI", abstract="Background: Growth hormone deficiency (GHD) and idiopathic short stature (ISS) are the major etiologies of short stature in children. For the diagnosis of GHD and ISS, meticulous evaluations are required, including growth hormone provocation tests, which are invasive and burdensome for children. Additionally, sella magnetic resonance imaging (MRI) is necessary for assessing etiologies of GHD, which cannot evaluate hormonal secretion. Recently, radiomics has emerged as a revolutionary technique that uses mathematical algorithms to extract various features for the quantitative analysis of medical images. Objective: This study aimed to develop a machine learning--based model using sella MRI--based radiomics and clinical parameters to diagnose GHD and ISS. Methods: A total of 293 children with short stature who underwent sella MRI and growth hormone provocation tests were included in the training set, and 47 children who met the same inclusion criteria were enrolled in the test set from different hospitals for this study. A total of 186 radiomic features were extracted from the pituitary glands using a semiautomatic segmentation process for both the T2-weighted and contrast-enhanced T1-weighted image. The clinical parameters included auxological data, insulin-like growth factor-I, and bone age. The extreme gradient boosting algorithm was used to train the prediction models. Internal validation was conducted using 5-fold cross-validation on the training set, and external validation was conducted on the test set. Model performance was assessed by plotting the area under the receiver operating characteristic curve. The mean absolute Shapley values were computed to quantify the impact of each parameter. Results: The area under the receiver operating characteristic curves (95\% CIs) of the clinical, radiomics, and combined models were 0.684 (0.590-0.778), 0.691 (0.620-0.762), and 0.830 (0.741-0.919), respectively, in the external validation. Among the clinical parameters, the major contributing factors to prediction were BMI SD score (SDS), chronological age--bone age, weight SDS, growth velocity, and insulin-like growth factor-I SDS in the clinical model. 
In the combined model, radiomic features including maximum probability from a T2-weighted image and run length nonuniformity normalized from a T2-weighted image added incremental value to the prediction (combined model vs clinical model, P=.03; combined model vs radiomics model, P=.02). The code for our model is available in a public repository on GitHub. Conclusions: Our model combining both radiomics and clinical parameters can accurately predict GHD from ISS, which was also proven in the external validation. These findings highlight the potential of machine learning--based models using radiomics and clinical parameters for diagnosing GHD and ISS. ", doi="10.2196/54641", url="https://www.jmir.org/2024/1/e54641" } @Article{info:doi/10.2196/58666, author="Sakamoto, Tetsu and Harada, Yukinori and Shimizu, Taro", title="Facilitating Trust Calibration in Artificial Intelligence--Driven Diagnostic Decision Support Systems for Determining Physicians' Diagnostic Accuracy: Quasi-Experimental Study", journal="JMIR Form Res", year="2024", month="Nov", day="27", volume="8", pages="e58666", keywords="trust calibration", keywords="artificial intelligence", keywords="diagnostic accuracy", keywords="diagnostic decision support", keywords="decision support", keywords="diagnosis", keywords="diagnostic", keywords="chart", keywords="history", keywords="reliable", keywords="reliability", keywords="accurate", keywords="accuracy", keywords="AI", abstract="Background: Diagnostic errors are significant problems in medical care. Despite the usefulness of artificial intelligence (AI)--based diagnostic decision support systems, the overreliance of physicians on AI-generated diagnoses may lead to diagnostic errors. Objective: We investigated the safe use of AI-based diagnostic decision support systems with trust calibration by adjusting trust levels to match the actual reliability of AI. Methods: A quasi-experimental study was conducted at Dokkyo Medical University, Japan, with physicians allocated (1:1) to the intervention and control groups. A total of 20 clinical cases were created based on the medical histories recorded by an AI-driven automated medical history--taking system from actual patients who visited a community-based hospital in Japan. The participants reviewed the medical histories of 20 clinical cases generated by an AI-driven automated medical history--taking system with an AI-generated list of 10 differential diagnoses and provided 1 to 3 possible diagnoses. Physicians were asked whether the final diagnosis was in the AI-generated list of 10 differential diagnoses in the intervention group, which served as the trust calibration. We analyzed the diagnostic accuracy of physicians and the correctness of the trust calibration in the intervention group. We also investigated the relationship between the accuracy of the trust calibration and the diagnostic accuracy of physicians, and the physicians' confidence level regarding the use of AI. Results: Among the 20 physicians assigned to the intervention (n=10) and control (n=10) groups, the mean age was 30.9 (SD 3.9) years and 31.7 (SD 4.2) years, the proportion of men was 80\% and 60\%, and the mean postgraduate year was 5.8 (SD 2.9) and 7.2 (SD 4.6), respectively, with no significant differences. The physicians' diagnostic accuracy was 41.5\% in the intervention group and 46\% in the control group, with no significant difference (95\% CI --0.75 to 2.55; P=.27). 
The overall accuracy of the trust calibration was only 61.5\%, and despite correct calibration, the diagnostic accuracy was 54.5\%. In the multivariate logistic regression model, the accuracy of the trust calibration was a significant contributor to the diagnostic accuracy of physicians (adjusted odds ratio 5.90, 95\% CI 2.93-12.46; P<.001). The mean confidence level for AI was 72.5\% in the intervention group and 45\% in the control group, with no significant difference. Conclusions: Trust calibration did not significantly improve physicians' diagnostic accuracy when considering the differential diagnoses generated by reading medical histories and the possible differential diagnosis lists of an AI-driven automated medical history--taking system. As this was a formative study, the small sample size and suboptimal trust calibration methods may have contributed to the lack of significant differences. This study highlights the need for a larger sample size and the implementation of supportive measures of trust calibration. ", doi="10.2196/58666", url="https://formative.jmir.org/2024/1/e58666" } @Article{info:doi/10.2196/58275, author="O'Malley, Andrew and Veenhuizen, Miriam and Ahmed, Ayla", title="Ensuring Appropriate Representation in Artificial Intelligence--Generated Medical Imagery: Protocol for a Methodological Approach to Address Skin Tone Bias", journal="JMIR AI", year="2024", month="Nov", day="27", volume="3", pages="e58275", keywords="artificial intelligence", keywords="generative AI", keywords="AI images", keywords="dermatology", keywords="anatomy", keywords="medical education", keywords="medical imaging", keywords="skin", keywords="skin tone", keywords="United States", keywords="educational material", keywords="psoriasis", keywords="digital imagery", abstract="Background: In medical education, particularly in anatomy and dermatology, generative artificial intelligence (AI) can be used to create customized illustrations. However, the underrepresentation of darker skin tones in medical textbooks and elsewhere, which serve as training data for AI, poses a significant challenge in ensuring diverse and inclusive educational materials. Objective: This study aims to evaluate the extent of skin tone diversity in AI-generated medical images and to test whether the representation of skin tones can be improved by modifying AI prompts to better reflect the demographic makeup of the US population. Methods: In total, 2 standard AI models (Dall-E [OpenAI] and Midjourney [Midjourney Inc]) each generated 100 images of people with psoriasis. In addition, a custom model was developed that incorporated a prompt injection aimed at ``forcing'' the AI (Dall-E 3) to reflect the skin tone distribution of the US population according to the 2012 American National Election Survey. This custom model generated another set of 100 images. The skin tones in these images were assessed by 3 researchers using the New Immigrant Survey skin tone scale, with the median value representing each image. A chi-square goodness of fit analysis compared the skin tone distributions from each set of images to that of the US population. Results: The standard AI models (Dall-E 3 and Midjourney) demonstrated a significant difference between the expected skin tones of the US population and the observed tones in the generated images (P<.001). Both standard AI models overrepresented lighter skin. 
Conversely, the custom model with the modified prompt yielded a distribution of skin tones that closely matched the expected demographic representation, showing no significant difference (P=.04). Conclusions: This study reveals a notable bias in AI-generated medical images, predominantly underrepresenting darker skin tones. This bias can be effectively addressed by modifying AI prompts to incorporate real-life demographic distributions. The findings emphasize the need for conscious efforts in AI development to ensure diverse and representative outputs, particularly in educational and medical contexts. Users of generative AI tools should be aware that these biases exist, and that similar tendencies may also exist in other types of generative AI (eg, large language models) and in other characteristics (eg, sex, gender, culture, and ethnicity). Injecting demographic data into AI prompts may effectively counteract these biases, ensuring a more accurate representation of the general population. ", doi="10.2196/58275", url="https://ai.jmir.org/2024/1/e58275" } @Article{info:doi/10.2196/54357, author="Cavero-Redondo, Iv{\'a}n and Martinez-Rodrigo, Arturo and Saz-Lara, Alicia and Moreno-Herraiz, Nerea and Casado-Vicente, Veronica and Gomez-Sanchez, Leticia and Garcia-Ortiz, Luis and Gomez-Marcos, A. Manuel and ", title="Antihypertensive Drug Recommendations for Reducing Arterial Stiffness in Patients With Hypertension: Machine Learning--Based Multicohort (RIGIPREV) Study", journal="J Med Internet Res", year="2024", month="Nov", day="25", volume="26", pages="e54357", keywords="antihypertensive", keywords="drugs", keywords="models", keywords="patients", keywords="pulse wave velocity", keywords="recommendations", keywords="hypertension", keywords="machine learning", keywords="drug recommendations", keywords="arterial stiffness", keywords="RIGIPREV", abstract="Background: High systolic blood pressure is one of the leading global risk factors for mortality, contributing significantly to cardiovascular diseases. Despite advances in treatment, a large proportion of patients with hypertension do not achieve optimal blood pressure control. Arterial stiffness (AS), measured by pulse wave velocity (PWV), is an independent predictor of cardiovascular events and overall mortality. Various antihypertensive drugs exhibit differential effects on PWV, but the extent to which these effects vary depending on individual patient characteristics is not well understood. Given the complexity of selecting the most appropriate antihypertensive medication for reducing PWV, machine learning (ML) techniques offer an opportunity to improve personalized treatment recommendations. Objective: This study aims to develop an ML model that provides personalized recommendations for antihypertensive medications aimed at reducing PWV. The model considers individual patient characteristics, such as demographic factors, clinical data, and cardiovascular measurements, to identify the most suitable antihypertensive agent for improving AS. Methods: This study, known as the RIGIPREV study, used data from the EVA, LOD-DIABETES, and EVIDENT studies involving individuals with hypertension with baseline and follow-up measurements. Antihypertensive drugs were grouped into classes such as angiotensin-converting enzyme inhibitors (ACEIs), angiotensin receptor blockers (ARBs), $\beta$-blockers, diuretics, and combinations of diuretics with ACEIs or ARBs. 
The primary outcomes were carotid-femoral and brachial-ankle PWV, while the secondary outcomes included various cardiovascular, anthropometric, and biochemical parameters. A multioutput regressor using 6 random forest models was used to predict the impact of each antihypertensive class on PWV reduction. Model performance was evaluated using the coefficient of determination (R2) and mean squared error. Results: The random forest models exhibited strong predictive capabilities, with internal validation yielding R2 values between 0.61 and 0.74, while external validation showed a range of 0.26 to 0.46. The mean squared error values ranged from 0.08 to 0.22 for internal validation and from 0.29 to 0.45 for external validation. Variable importance analysis revealed that glycated hemoglobin and weight were the most critical predictors for ACEIs, while carotid-femoral PWV and total cholesterol were key variables for ARBs. The decision tree model achieved an accuracy of 84.02\% in identifying the most suitable antihypertensive drug based on individual patient characteristics. Furthermore, the system's recommendations for ARBs matched 55.3\% of patients' original prescriptions. Conclusions: This study demonstrates the utility of ML techniques in providing personalized treatment recommendations for antihypertensive therapy. By accounting for individual patient characteristics, the model improves the selection of drugs that control blood pressure and reduce AS. These findings could significantly aid clinicians in optimizing hypertension management and reducing cardiovascular risk. However, further studies with larger and more diverse populations are necessary to validate these results and extend the model's applicability. ", doi="10.2196/54357", url="https://www.jmir.org/2024/1/e54357", url="http://www.ncbi.nlm.nih.gov/pubmed/39585738" } @Article{info:doi/10.2196/59564, author="Chua, Chien Mei and Hadimaja, Matthew and Wong, Jill and Mukherjee, Subhra Sankha and Foussat, Agathe and Chan, Daniel and Nandal, Umesh and Yap, Fabian", title="Exploring the Use of a Length AI Algorithm to Estimate Children's Length from Smartphone Images in a Real-World Setting: Algorithm Development and Usability Study", journal="JMIR Pediatr Parent", year="2024", month="Nov", day="22", volume="7", pages="e59564", keywords="computer vision", keywords="length estimation", keywords="artificial intelligence", keywords="smartphone images", keywords="children", keywords="AI", keywords="algorithm", keywords="imaging", keywords="height", keywords="length", keywords="measure", keywords="pediatric", keywords="infant", keywords="neonatal", keywords="newborn", keywords="smartphone", keywords="mHealth", keywords="mobile health", keywords="mobile phone", abstract="Background: Length measurement in young children younger than 18 months is important for monitoring growth and development. Accurate length measurement requires proper equipment, standardized methods, and trained personnel. In addition, length measurement requires young children's cooperation, making it particularly challenging during infancy and toddlerhood. Objective: This study aimed to develop a length artificial intelligence (LAI) algorithm to aid users in determining recumbent length conveniently from smartphone images and explore its performance and suitability for personal and clinical use. Methods: This proof-of-concept study in healthy children (aged 0-18 months) was performed at KK Women's and Children's Hospital, Singapore, from November 2021 to March 2022. 
Smartphone images were taken by parents and investigators. Standardized length-board measurements were taken by trained investigators. Performance was evaluated by comparing the tool's image-based length estimations with length-board measurements (bias [mean error, mean difference between measured and predicted length]; absolute error [magnitude of error]). Prediction performance was evaluated on an individual-image basis and participant-averaged basis. User experience was collected through questionnaires. Results: A total of 215 participants (median age 4.4, IQR 1.9-9.7 months) were included. The tool produced a length prediction for 99.4\% (2211/2224) of photos analyzed. The mean absolute error was 2.47 cm for individual image predictions and 1.77 cm for participant-averaged predictions. Investigators and parents reported no difficulties in capturing the required photos for most participants (182/215, 84.7\% participants and 144/200, 72\% participants, respectively). Conclusions: The LAI algorithm is an accessible and novel way of estimating children's length from smartphone images without the need for specialized equipment or trained personnel. The LAI algorithm's current performance and ease of use suggest its potential for use by parents or caregivers with an accuracy approaching what is typically achieved in general clinics or community health settings. The results show that the algorithm is acceptable for use in a personal setting, serving as a proof of concept for use in clinical settings. Trial Registration: ClinicalTrials.gov NCT05079776; https://clinicaltrials.gov/ct2/show/NCT05079776 ", doi="10.2196/59564", url="https://pediatrics.jmir.org/2024/1/e59564" } @Article{info:doi/10.2196/60334, author="Tang, Jian and Huang, Zikun and Xu, Hongzhen and Zhang, Hao and Huang, Hailing and Tang, Minqiong and Luo, Pengsheng and Qin, Dong", title="Chinese Clinical Named Entity Recognition With Segmentation Synonym Sentence Synthesis Mechanism: Algorithm Development and Validation", journal="JMIR Med Inform", year="2024", month="Nov", day="21", volume="12", pages="e60334", keywords="clinical named entity recognition", keywords="word embedding", keywords="Chinese electronic medical records", keywords="RoBERTa", keywords="entity recognition", keywords="segmentation", keywords="natural language processing", keywords="AI", keywords="artificial intelligence", keywords="dataset", keywords="dataset augmentation", keywords="algorithm", keywords="entity", keywords="EMR", abstract="Background: Clinical named entity recognition (CNER) is a fundamental task in natural language processing used to extract named entities from electronic medical record texts. In recent years, with the continuous development of machine learning, deep learning models have replaced traditional machine learning and template-based methods, becoming widely applied in the CNER field. However, due to the complexity of clinical texts, the diversity and large quantity of named entity types, and the unclear boundaries between different entities, existing advanced methods rely to some extent on annotated databases and the scale of embedded dictionaries. Objective: This study aims to address the issues of data scarcity and labeling difficulties in CNER tasks by proposing a dataset augmentation algorithm based on proximity word calculation. 
Methods: We propose a Segmentation Synonym Sentence Synthesis (SSSS) algorithm based on neighboring vocabulary, which leverages existing public knowledge without the need for manual expansion of specialized domain dictionaries. Through lexical segmentation, the algorithm replaces new synonymous vocabulary by recombining from vast natural language data, achieving nearby expansion expressions of the dataset. We applied the SSSS algorithm to the Robustly Optimized Bidirectional Encoder Representations from Transformers Pretraining Approach (RoBERTa) + conditional random field (CRF) and RoBERTa + Bidirectional Long Short-Term Memory (BiLSTM) + CRF models and evaluated our models (SSSS + RoBERTa + CRF; SSSS + RoBERTa + BiLSTM + CRF) on the China Conference on Knowledge Graph and Semantic Computing (CCKS) 2017 and 2019 datasets. Results: Our experiments demonstrated that the models SSSS + RoBERTa + CRF and SSSS + RoBERTa + BiLSTM + CRF achieved F1-scores of 91.30\% and 91.35\% on the CCKS-2017 dataset, respectively. They also achieved F1-scores of 83.21\% and 83.01\% on the CCKS-2019 dataset, respectively. Conclusions: The experimental results indicated that our proposed method successfully expanded the dataset and remarkably improved the performance of the model, effectively addressing the challenges of data acquisition, annotation difficulties, and insufficient model generalization performance. ", doi="10.2196/60334", url="https://medinform.jmir.org/2024/1/e60334" } @Article{info:doi/10.2196/59480, author="Gopukumar, Deepika and Menon, Nirup and Schoen, W. Martin", title="Medication Prescription Policy for US Veterans With Metastatic Castration-Resistant Prostate Cancer: Causal Machine Learning Approach", journal="JMIR Med Inform", year="2024", month="Nov", day="19", volume="12", pages="e59480", keywords="prostate cancer", keywords="metastatic castration resistant prostate cancer", keywords="causal survival forest", keywords="machine learning", keywords="heterogeneity", keywords="prescription policy tree", keywords="oncology", keywords="pharmacology", abstract="Background: Prostate cancer is the second leading cause of death among American men. If detected and treated at an early stage, prostate cancer is often curable. However, an advanced stage such as metastatic castration-resistant prostate cancer (mCRPC) has a high risk of mortality. Multiple treatment options exist, the most common included docetaxel, abiraterone, and enzalutamide. Docetaxel is a cytotoxic chemotherapy, whereas abiraterone and enzalutamide are androgen receptor pathway inhibitors (ARPI). ARPIs are preferred over docetaxel due to lower toxicity. No study has used machine learning with patients' demographics, test results, and comorbidities to identify heterogeneous treatment rules that might improve the survival duration of patients with mCRPC. Objective: This study aimed to measure patient-level heterogeneity in the association of medication prescribed with overall survival duration (in the form of follow-up days) and arrive at a set of medication prescription rules using patient demographics, test results, and comorbidities. Methods: We excluded patients with mCRPC who were on docetaxel, cabazitaxel, mitoxantrone, and sipuleucel-T either before or after the prescription of an ARPI. We included only the African American and white populations. 
In total, 2886 identified veterans treated for mCRPC who were prescribed either abiraterone or enzalutamide as the first line of treatment from 2014 to 2017, with follow-up until 2020, were analyzed. We used causal survival forests for analysis. The unit level of analysis was the patient. The primary outcome of this study was follow-up days indicating survival duration while on the first-line medication. After estimating the treatment effect, a prescription policy tree was constructed. Results: For 2886 veterans, enzalutamide is associated with an average of 59.94 (95\% CI 35.60-84.28) more days of survival than abiraterone. The increase in overall survival duration for the 2 drugs varied across patient demographics, test results, and comorbidities. Two data-driven subgroups of patients were identified by ranking them on their augmented inverse-propensity weighted (AIPW) scores. The average AIPW scores for the 2 subgroups were 19.36 (95\% CI --16.93 to 55.65) and 100.68 (95\% CI 62.46-138.89). Based on visualization and t test, the AIPW score for low and high subgroups was significant (P=.003), thereby supporting heterogeneity. The analysis resulted in a set of prescription rules for the 2 ARPIs based on a few covariates available to the physicians at the time of prescription. Conclusions: This study of 2886 veterans showed evidence of heterogeneity and that survival days may be improved for certain patients with mCRPC based on the medication prescribed. Findings suggest that prescription rules based on the patient characteristics, laboratory test results, and comorbidities available to the physician at the time of prescription could improve survival by providing personalized treatment decisions. ", doi="10.2196/59480", url="https://medinform.jmir.org/2024/1/e59480" } @Article{info:doi/10.2196/49724, author="Cho, Na Ha and Jun, Joon Tae and Kim, Young-Hak and Kang, Heejun and Ahn, Imjin and Gwon, Hansle and Kim, Yunha and Seo, Jiahn and Choi, Heejung and Kim, Minkyoung and Han, Jiye and Kee, Gaeun and Park, Seohyun and Ko, Soyoung", title="Task-Specific Transformer-Based Language Models in Health Care: Scoping Review", journal="JMIR Med Inform", year="2024", month="Nov", day="18", volume="12", pages="e49724", keywords="transformer-based language models", keywords="medicine", keywords="health care", keywords="medical language model", abstract="Background: Transformer-based language models have shown great potential to revolutionize health care by advancing clinical decision support, patient interaction, and disease prediction. However, despite their rapid development, the implementation of transformer-based language models in health care settings remains limited. This is partly due to the lack of a comprehensive review, which hinders a systematic understanding of their applications and limitations. Without clear guidelines and consolidated information, both researchers and physicians face difficulties in using these models effectively, resulting in inefficient research efforts and slow integration into clinical workflows. Objective: This scoping review addresses this gap by examining studies on medical transformer-based language models and categorizing them into 6 tasks: dialogue generation, question answering, summarization, text classification, sentiment analysis, and named entity recognition. Methods: We conducted a scoping review following the Cochrane scoping review protocol. 
A comprehensive literature search was performed across databases, including Google Scholar and PubMed, covering publications from January 2017 to September 2024. Studies involving transformer-derived models in medical tasks were included. Data were categorized into 6 key tasks. Results: Our key findings revealed both advancements and critical challenges in applying transformer-based models to health care tasks. For example, models like MedPIR involving dialogue generation show promise but face privacy and ethical concerns, while question-answering models like BioBERT improve accuracy but struggle with the complexity of medical terminology. The BioBERTSum summarization model aids clinicians by condensing medical texts but needs better handling of long sequences. Conclusions: This review attempted to provide a consolidated understanding of the role of transformer-based language models in health care and to guide future research directions. By addressing current challenges and exploring the potential for real-world applications, we envision significant improvements in health care informatics. Addressing the identified challenges and implementing proposed solutions can enable transformer-based language models to significantly improve health care delivery and patient outcomes. Our review provides valuable insights for future research and practical applications, setting the stage for transformative advancements in medical informatics. ", doi="10.2196/49724", url="https://medinform.jmir.org/2024/1/e49724" } @Article{info:doi/10.2196/53616, author="Chustecki, Margaret", title="Benefits and Risks of AI in Health Care: Narrative Review", journal="Interact J Med Res", year="2024", month="Nov", day="18", volume="13", pages="e53616", keywords="artificial intelligence", keywords="safety risks", keywords="biases", keywords="AI", keywords="benefit", keywords="risk", keywords="health care", keywords="safety", keywords="ethics", keywords="transparency", keywords="data privacy", keywords="accuracy", abstract="Background: The integration of artificial intelligence (AI) into health care has the potential to transform the industry, but it also raises ethical, regulatory, and safety concerns. This review paper provides an in-depth examination of the benefits and risks associated with AI in health care, with a focus on issues like biases, transparency, data privacy, and safety. Objective: This study aims to evaluate the advantages and drawbacks of incorporating AI in health care. This assessment centers on the potential biases in AI algorithms, transparency challenges, data privacy issues, and safety risks in health care settings. Methods: Studies included in this review were selected based on their relevance to AI applications in health care, focusing on ethical, regulatory, and safety considerations. Inclusion criteria encompassed peer-reviewed articles, reviews, and relevant research papers published in English. Exclusion criteria included non--peer-reviewed articles, editorials, and studies not directly related to AI in health care. A comprehensive literature search was conducted across 8 databases: OVID MEDLINE, OVID Embase, OVID PsycINFO, EBSCO CINAHL Plus with Full Text, ProQuest Sociological Abstracts, ProQuest Philosopher's Index, ProQuest Advanced Technologies \& Aerospace, and Wiley Cochrane Library. The search was last updated on June 23, 2023. Results were synthesized using qualitative methods to identify key themes and findings related to the benefits and risks of AI in health care. 
Results: The literature search yielded 8796 articles. After removing duplicates and applying the inclusion and exclusion criteria, 44 studies were included in the qualitative synthesis. This review highlights the significant promise that AI holds in health care, such as enhancing health care delivery by providing more accurate diagnoses, personalized treatment plans, and efficient resource allocation. However, persistent concerns remain, including biases ingrained in AI algorithms, a lack of transparency in decision-making, potential compromises of patient data privacy, and safety risks associated with AI implementation in clinical settings. Conclusions: In conclusion, while AI presents the opportunity for a health care revolution, it is imperative to address the ethical, regulatory, and safety challenges linked to its integration. Proactive measures are required to ensure that AI technologies are developed and deployed responsibly, striking a balance between innovation and the safeguarding of patient well-being. ", doi="10.2196/53616", url="https://www.i-jmr.org/2024/1/e53616" } @Article{info:doi/10.2196/65033, author="Li, Xingyuan and Liu, Ke and Lang, Yanlin and Chai, Zhonglin and Liu, Fang", title="Exploring the Potential of Claude 3 Opus in Renal Pathological Diagnosis: Performance Evaluation", journal="JMIR Med Inform", year="2024", month="Nov", day="15", volume="12", pages="e65033", keywords="artificial intelligence", keywords="Claude 3 Opus", keywords="renal pathology", keywords="diagnostic performance", keywords="large language model", keywords="LLM", keywords="performance evaluation", keywords="medical diagnosis", keywords="AI language model", keywords="diagnosis", keywords="pathology images", keywords="pathologist", keywords="clinical relevance", keywords="accuracy", keywords="language fluency", keywords="pathological diagnosis", abstract="Background: Artificial intelligence (AI) has shown great promise in assisting medical diagnosis, but its application in renal pathology remains limited. Objective: We evaluated the performance of an advanced AI language model, Claude 3 Opus (Anthropic), in generating diagnostic descriptions for renal pathological images. Methods: We carefully curated a dataset of 100 representative renal pathological images from the Diagnostic Atlas of Renal Pathology (3rd edition). The image selection aimed to cover a wide spectrum of common renal diseases, ensuring a balanced and comprehensive dataset. Claude 3 Opus generated diagnostic descriptions for each image, which were scored by 2 pathologists on clinical relevance, accuracy, fluency, completeness, and overall value. Results: Claude 3 Opus achieved a high mean score in language fluency (3.86) but lower scores in clinical relevance (1.75), accuracy (1.55), completeness (2.01), and overall value (1.75). Performance varied across disease types. Interrater agreement was substantial for relevance ($\kappa$=0.627) and overall value ($\kappa$=0.589) and moderate for accuracy ($\kappa$=0.485) and completeness ($\kappa$=0.458). Conclusions: Claude 3 Opus shows potential in generating fluent renal pathology descriptions but needs improvement in accuracy and clinical value. The AI's performance varied across disease types. Addressing the limitations of single-source data and incorporating comparative analyses with other AI approaches are essential steps for future research. Further optimization and validation are needed for clinical applications. 
", doi="10.2196/65033", url="https://medinform.jmir.org/2024/1/e65033" } @Article{info:doi/10.2196/63356, author="Abbasgholizadeh Rahimi, Samira and Shrivastava, Richa and Brown-Johnson, Anita and Caidor, Pascale and Davies, Claire and Idrissi Janati, Amal and Kengne Talla, Pascaline and Madathil, Sreenath and Willie, M. Bettina and Emami, Elham", title="EDAI Framework for Integrating Equity, Diversity, and Inclusion Throughout the Lifecycle of AI to Improve Health and Oral Health Care: Qualitative Study", journal="J Med Internet Res", year="2024", month="Nov", day="15", volume="26", pages="e63356", keywords="equity, diversity, and inclusion", keywords="EDI", keywords="health care", keywords="oral health care", keywords="machine learning", keywords="artificial intelligence", keywords="AI", abstract="Background: Recent studies have identified significant gaps in equity, diversity, and inclusion (EDI) considerations within the lifecycle of artificial intelligence (AI), spanning from data collection and problem definition to implementation stages. Despite the recognized need for integrating EDI principles, there is currently no existing guideline or framework to support this integration in the AI lifecycle. Objective: This study aimed to address this gap by identifying EDI principles and indicators to be integrated into the AI lifecycle. The goal was to develop a comprehensive guiding framework to guide the development and implementation of future AI systems. Methods: This study was conducted in 3 phases. In phase 1, a comprehensive systematic scoping review explored how EDI principles have been integrated into AI in health and oral health care settings. In phase 2, a multidisciplinary team was established, and two 2-day, in-person international workshops with over 60 representatives from diverse backgrounds, expertise, and communities were conducted. The workshops included plenary presentations, round table discussions, and focused group discussions. In phase 3, based on the workshops' insights, the EDAI framework was developed and refined through iterative feedback from participants. The results of the initial systematic scoping review have been published separately, and this paper focuses on subsequent phases of the project, which is related to framework development. Results: In this study, we developed the EDAI framework, a comprehensive guideline that integrates EDI principles and indicators throughout the entire AI lifecycle. This framework addresses existing gaps at various stages, from data collection to implementation, and focuses on individual, organizational, and systemic levels. Additionally, we identified both the facilitators and barriers to integrating EDI within the AI lifecycle in health and oral health care. Conclusions: The developed EDAI framework provides a comprehensive, actionable guideline for integrating EDI principles into AI development and deployment. By facilitating the systematic incorporation of these principles, the framework supports the creation and implementation of AI systems that are not only technologically advanced but also sensitive to EDI principles. 
", doi="10.2196/63356", url="https://www.jmir.org/2024/1/e63356", url="http://www.ncbi.nlm.nih.gov/pubmed/39546793" } @Article{info:doi/10.2196/58617, author="Wang, Renwu and Xu, Huimin and Zhang, Xupin", title="Impact of Image Content on Medical Crowdfunding Success: A Machine Learning Approach", journal="J Med Internet Res", year="2024", month="Nov", day="15", volume="26", pages="e58617", keywords="medical crowdfunding", keywords="visual analytics", keywords="machine learning", keywords="image content", keywords="crowdfunding success", abstract="Background: As crowdfunding sites proliferate, visual content often serves as the initial bridge connecting a project to its potential backers, underscoring the importance of image selection in effectively engaging an audience. Objective: This paper aims to explore the relationship between images and crowdfunding success in cancer-related crowdfunding projects. Methods: We used the Alibaba Cloud platform to detect individual features in images. In addition, we used the Recognize Anything Model to label images and obtain content tags. Furthermore, the discourse atomic topic model was used to generate image topics. After obtaining the image features and image content topics, we built regression models to investigate the factors that influence the results of crowdfunding success. Results: Images with a higher proportion of young people ($\beta$=0.0753; P<.001), a larger number of people ($\beta$=0.00822; P<.001), and a larger proportion of smiling faces ($\beta$=0.0446; P<.001) had a higher success rate. Image content related to good things and patient health also contributed to crowdfunding success ($\beta$=0.082, P<.001; and $\beta$=0.036, P<.001, respectively). In addition, the interaction between image topics and image characteristics had a significant effect on the final fundraising outcome. For example, when smiling faces are considered in conjunction with the image topics, using more smiling faces in the rest and play theme increased the amount of money raised ($\beta$=0.0152; P<.001). We also examined causality through a counterfactual analysis, which confirmed the influence of the variables on crowdfunding success, consistent with the results of our regression models. Conclusions: In the realm of web-based medical crowdfunding, the importance of uploaded images cannot be overstated. Image characteristics, including the number of people depicted and the presence of youth, significantly improve fundraising results. In addition, the thematic choice of images in cancer crowdfunding efforts has a profound impact. Images that evoke beauty and resonate with health issues are more likely to result in increased donations. However, it is critical to recognize that reinforcing character traits in images of different themes has different effects on the success of crowdfunding campaigns. 
", doi="10.2196/58617", url="https://www.jmir.org/2024/1/e58617" } @Article{info:doi/10.2196/51432, author="Chotwanvirat, Phawinpon and Prachansuwan, Aree and Sridonpai, Pimnapanut and Kriengsinyos, Wantanee", title="Advancements in Using AI for Dietary Assessment Based on Food Images: Scoping Review", journal="J Med Internet Res", year="2024", month="Nov", day="15", volume="26", pages="e51432", keywords="image-assisted dietary assessment", keywords="artificial intelligence", keywords="dietary assessment", keywords="mobile phone", keywords="food intake", keywords="image recognition", keywords="portion size", abstract="Background: To accurately capture an individual's food intake, dietitians are often required to ask clients about their food frequencies and portions, and they have to rely on the client's memory, which can be burdensome. While taking food photos alongside food records can alleviate user burden and reduce errors in self-reporting, this method still requires trained staff to translate food photos into dietary intake data. Image-assisted dietary assessment (IADA) is an innovative approach that uses computer algorithms to mimic human performance in estimating dietary information from food images. This field has seen continuous improvement through advancements in computer science, particularly in artificial intelligence (AI). However, the technical nature of this field can make it challenging for those without a technical background to understand it completely. Objective: This review aims to fill the gap by providing a current overview of AI's integration into dietary assessment using food images. The content is organized chronologically and presented in an accessible manner for those unfamiliar with AI terminology. In addition, we discuss the systems' strengths and weaknesses and propose enhancements to improve IADA's accuracy and adoption in the nutrition community. Methods: This scoping review used PubMed and Google Scholar databases to identify relevant studies. The review focused on computational techniques used in IADA, specifically AI models, devices, and sensors, or digital methods for food recognition and food volume estimation published between 2008 and 2021. Results: A total of 522 articles were initially identified. On the basis of a rigorous selection process, 84 (16.1\%) articles were ultimately included in this review. The selected articles reveal that early systems, developed before 2015, relied on handcrafted machine learning algorithms to manage traditional sequential processes, such as segmentation, food identification, portion estimation, and nutrient calculations. Since 2015, these handcrafted algorithms have been largely replaced by deep learning algorithms for handling the same tasks. More recently, the traditional sequential process has been superseded by advanced algorithms, including multitask convolutional neural networks and generative adversarial networks. Most of the systems were validated for macronutrient and energy estimation, while only a few were capable of estimating micronutrients, such as sodium. Notably, significant advancements have been made in the field of IADA, with efforts focused on replicating humanlike performance. Conclusions: This review highlights the progress made by IADA, particularly in the areas of food identification and portion estimation. Advancements in AI techniques have shown great potential to improve the accuracy and efficiency of this field. 
However, it is crucial to involve dietitians and nutritionists in the development of these systems to ensure they meet the requirements and trust of professionals in the field. ", doi="10.2196/51432", url="https://www.jmir.org/2024/1/e51432" } @Article{info:doi/10.2196/22769, author="Wang, Leyao and Wan, Zhiyu and Ni, Congning and Song, Qingyuan and Li, Yang and Clayton, Ellen and Malin, Bradley and Yin, Zhijun", title="Applications and Concerns of ChatGPT and Other Conversational Large Language Models in Health Care: Systematic Review", journal="J Med Internet Res", year="2024", month="Nov", day="7", volume="26", pages="e22769", keywords="large language model", keywords="ChatGPT", keywords="artificial intelligence", keywords="natural language processing", keywords="health care", keywords="summarization", keywords="medical knowledge inquiry", keywords="reliability", keywords="bias", keywords="privacy", abstract="Background: The launch of ChatGPT (OpenAI) in November 2022 attracted public attention and academic interest to large language models (LLMs), facilitating the emergence of many other innovative LLMs. These LLMs have been applied in various fields, including health care. Numerous studies have since been conducted regarding how to use state-of-the-art LLMs in health-related scenarios. Objective: This review aims to summarize applications of and concerns regarding conversational LLMs in health care and provide an agenda for future research in this field. Methods: We used PubMed, ACM, and the IEEE digital libraries as primary sources for this review. We followed the guidance of PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) to screen and select peer-reviewed research articles that (1) were related to health care applications and conversational LLMs and (2) were published before September 1, 2023, the date when we started paper collection. We investigated these papers and classified them according to their applications and concerns. Results: Our search initially identified 820 papers according to targeted keywords, out of which 65 (7.9\%) papers met our criteria and were included in the review. The most popular conversational LLM was ChatGPT (60/65, 92\% of papers), followed by Bard (Google LLC; 1/65, 2\% of papers), LLaMA (Meta; 1/65, 2\% of papers), and other LLMs (6/65, 9\% papers). These papers were classified into four categories of applications: (1) summarization, (2) medical knowledge inquiry, (3) prediction (eg, diagnosis, treatment recommendation, and drug synergy), and (4) administration (eg, documentation and information collection), and four categories of concerns: (1) reliability (eg, training data quality, accuracy, interpretability, and consistency in responses), (2) bias, (3) privacy, and (4) public acceptability. There were 49 (75\%) papers using LLMs for either summarization or medical knowledge inquiry, or both, and there are 58 (89\%) papers expressing concerns about either reliability or bias, or both. We found that conversational LLMs exhibited promising results in summarization and providing general medical knowledge to patients with a relatively high accuracy. However, conversational LLMs such as ChatGPT are not always able to provide reliable answers to complex health-related tasks (eg, diagnosis) that require specialized domain expertise. While bias or privacy issues are often noted as concerns, no experiments in our reviewed papers thoughtfully examined how conversational LLMs lead to these issues in health care research. 
Conclusions: Future studies should focus on improving the reliability of LLM applications in complex health-related tasks, as well as investigating the mechanisms of how LLM applications bring bias and privacy issues. Considering the vast accessibility of LLMs, legal, social, and technical efforts are all needed to address concerns about LLMs to promote, improve, and regularize the application of LLMs in health care. ", doi="10.2196/22769", url="https://www.jmir.org/2024/1/e22769" } @Article{info:doi/10.2196/64593, author="Oliveira, Almeida Juliana and Eskandar, Karine and Kar, Emre and de Oliveira, Ribeiro Fl{\'a}via and Filho, Silva Agnaldo Lopes da", title="Understanding AI's Role in Endometriosis Patient Education and Evaluating Its Information and Accuracy: Systematic Review", journal="JMIR AI", year="2024", month="Oct", day="30", volume="3", pages="e64593", keywords="endometriosis", keywords="gynecology", keywords="machine learning", keywords="artificial intelligence", keywords="large language models", keywords="natural language processing", keywords="patient-generated health data", keywords="health knowledge", keywords="information seeking", keywords="patient education", abstract="Background: Endometriosis is a chronic gynecological condition that affects a significant portion of women of reproductive age, leading to debilitating symptoms such as chronic pelvic pain and infertility. Despite advancements in diagnosis and management, patient education remains a critical challenge. With the rapid growth of digital platforms, artificial intelligence (AI) has emerged as a potential tool to enhance patient education and access to information. Objective: This systematic review aims to explore the role of AI in facilitating education and improving information accessibility for individuals with endometriosis. Methods: This review followed the Preferred Reporting Items for Systematic reviews and Meta-Analyses (PRISMA) guidelines to ensure rigorous and transparent reporting. We conducted a comprehensive search of PubMed; Embase; the Regional Online Information System for Scientific Journals of Latin America, the Caribbean, Spain and Portugal (LATINDEX); Latin American and Caribbean Literature in Health Sciences (LILACS); Institute of Electrical and Electronics Engineers (IEEE) Xplore, and the Cochrane Central Register of Controlled Trials using the terms ``endometriosis'' and ``artificial intelligence.'' Studies were selected based on their focus on AI applications in patient education or information dissemination regarding endometriosis. We included studies that evaluated AI-driven tools for assessing patient knowledge and addressed frequently asked questions related to endometriosis. Data extraction and quality assessment were conducted independently by 2 authors, with discrepancies resolved through consensus. Results: Out of 400 initial search results, 11 studies met the inclusion criteria and were fully reviewed. We ultimately included 3 studies, 1 of which was an abstract. The studies examined the use of AI models, such as ChatGPT (OpenAI), machine learning, and natural language processing, in providing educational resources and answering common questions about endometriosis. The findings indicated that AI tools, particularly large language models, offer accurate responses to frequently asked questions with varying degrees of sufficiency across different categories. 
AI's integration with social media platforms also highlights its potential to identify patients' needs and enhance information dissemination. Conclusions: AI holds promise in advancing patient education and information access for endometriosis, providing accurate and comprehensive answers to common queries, and facilitating a better understanding of the condition. However, challenges remain in ensuring ethical use, equitable access, and maintaining accuracy across diverse patient populations. Future research should focus on developing standardized approaches for evaluating AI's impact on patient education and exploring its integration into clinical practice to enhance support for individuals with endometriosis. ", doi="10.2196/64593", url="https://ai.jmir.org/2024/1/e64593" } @Article{info:doi/10.2196/53207, author="Rosenbacke, Rikard and Melhus, {\AA}sa and McKee, Martin and Stuckler, David", title="How Explainable Artificial Intelligence Can Increase or Decrease Clinicians' Trust in AI Applications in Health Care: Systematic Review", journal="JMIR AI", year="2024", month="Oct", day="30", volume="3", pages="e53207", keywords="explainable artificial intelligence", keywords="XAI", keywords="trustworthy AI", keywords="clinician trust", keywords="affect-based measures", keywords="cognitive measures", keywords="clinical use", keywords="clinical decision-making", keywords="clinical informatics", abstract="Background: Artificial intelligence (AI) has significant potential in clinical practice. However, its ``black box'' nature can lead clinicians to question its value. The challenge is to create sufficient trust for clinicians to feel comfortable using AI, but not so much that they defer to it even when it produces results that conflict with their clinical judgment in ways that lead to incorrect decisions. Explainable AI (XAI) aims to address this by providing explanations of how AI algorithms reach their conclusions. However, it remains unclear whether such explanations foster an appropriate degree of trust to ensure the optimal use of AI in clinical practice. Objective: This study aims to systematically review and synthesize empirical evidence on the impact of XAI on clinicians' trust in AI-driven clinical decision-making. Methods: A systematic review was conducted in accordance with PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines, searching PubMed and Web of Science databases. Studies were included if they empirically measured the impact of XAI on clinicians' trust using cognition- or affect-based measures. Out of 778 articles screened, 10 met the inclusion criteria. We assessed the risk of bias using standard tools appropriate to the methodology of each paper. Results: The risk of bias in all papers was moderate or moderate to high. All included studies operationalized trust primarily through cognitive-based definitions, with 2 also incorporating affect-based measures. Out of these, 5 studies reported that XAI increased clinicians' trust compared with standard AI, particularly when the explanations were clear, concise, and relevant to clinical practice. In addition, 3 studies found no significant effect of XAI on trust, and the presence of explanations does not automatically improve trust. Notably, 2 studies highlighted that XAI could either enhance or diminish trust, depending on the complexity and coherence of the provided explanations. 
The majority of studies suggest that XAI has the potential to enhance clinicians' trust in recommendations generated by AI. However, complex or contradictory explanations can undermine this trust. More critically, trust in AI is not inherently beneficial, as AI recommendations are not infallible. These findings underscore the nuanced role of explanation quality and suggest that trust can be modulated through the careful design of XAI systems. Conclusions: Excessive trust in incorrect advice generated by AI can adversely impact clinical accuracy, just as can happen when correct advice is distrusted. Future research should focus on refining both cognitive and affect-based measures of trust and on developing strategies to achieve an appropriate balance in terms of trust, preventing both blind trust and undue skepticism. Optimizing trust in AI systems is essential for their effective integration into clinical practice. ", doi="10.2196/53207", url="https://ai.jmir.org/2024/1/e53207" } @Article{info:doi/10.2196/54710, author="Alam, Ashraful Md and Sajib, Zaman Md Refat Uz and Rahman, Fariya and Ether, Saraban and Hanson, Molly and Sayeed, Abu and Akter, Ema and Nusrat, Nowrin and Islam, Tahrin Tanjeena and Raza, Sahar and Tanvir, M. K. and Chisti, Jobayer Mohammod and Rahman, Sadeq-ur Qazi and Hossain, Akm and Layek, MA and Zaman, Asaduz and Rana, Juwel and Rahman, Moshfiqur Syed and Arifeen, El Shams and Rahman, Ehsanur Ahmed and Ahmed, Anisuddin", title="Implications of Big Data Analytics, AI, Machine Learning, and Deep Learning in the Health Care System of Bangladesh: Scoping Review", journal="J Med Internet Res", year="2024", month="Oct", day="28", volume="26", pages="e54710", keywords="machine learning", keywords="deep learning", keywords="artificial intelligence", keywords="big data analytics", keywords="public health", keywords="health care", keywords="mobile phone", keywords="Bangladesh", abstract="Background: The rapid advancement of digital technologies, particularly in big data analytics (BDA), artificial intelligence (AI), machine learning (ML), and deep learning (DL), is reshaping the global health care system, including in Bangladesh. The increased adoption of these technologies in health care delivery within Bangladesh has sparked their integration into health care and public health research, resulting in a noticeable surge in related studies. However, a critical gap exists, as there is a lack of comprehensive evidence regarding the research landscape; regulatory challenges; use cases; and the application and adoption of BDA, AI, ML, and DL in the health care system of Bangladesh. This gap impedes the attainment of optimal results. As Bangladesh is a leading implementer of digital technologies, bridging this gap is urgent for the effective use of these advancing technologies. Objective: This scoping review aims to collate (1) the existing research in Bangladesh's health care system, using the aforementioned technologies and synthesizing their findings, and (2) the limitations faced by researchers in integrating the aforementioned technologies into health care research. 
Methods: MEDLINE (via PubMed), IEEE Xplore, Scopus, and Embase databases were searched to identify published research articles between January 1, 2000, and September 10, 2023, meeting the following inclusion criteria: (1) any study using any of the BDA, AI, ML, and DL technologies and health care and public health datasets for predicting health issues and forecasting any kind of outbreak; (2) studies primarily focusing on health care and public health issues in Bangladesh; and (3) original research articles published in peer-reviewed journals and conference proceedings written in English. Results: With the initial search, we identified 1653 studies. Following the inclusion and exclusion criteria and full-text review, 4.66\% (77/1653) of the articles were finally included in this review. There was a substantial increase in studies over the last 5 years (2017-2023). Among the 77 studies, the majority (n=65, 84\%) used ML models. A smaller proportion of studies incorporated AI (4/77, 5\%), DL (7/77, 9\%), and BDA (1/77, 1\%) technologies. Among the reviewed articles, 52\% (40/77) relied on primary data, while the remaining 48\% (37/77) used secondary data. The primary research areas of focus were infectious diseases (15/77, 19\%), noncommunicable diseases (23/77, 30\%), child health (11/77, 14\%), and mental health (9/77, 12\%). Conclusions: This scoping review highlights remarkable progress in leveraging BDA, AI, ML, and DL within Bangladesh's health care system. The observed surge in studies over the last 5 years underscores the increasing significance of AI and related technologies in health care research. Notably, most (65/77, 84\%) studies focused on ML models, unveiling opportunities for advancements in predictive modeling. This review encapsulates the current state of technological integration and propels us into a promising era for the future of digital Bangladesh. ", doi="10.2196/54710", url="https://www.jmir.org/2024/1/e54710", url="http://www.ncbi.nlm.nih.gov/pubmed/39466315" } @Article{info:doi/10.2196/53488, author="Deng, Tianjie and Urbaczewski, Andrew and Lee, Jin Young and Barman-Adhikari, Anamika and Dewri, Rinku", title="Identifying Marijuana Use Behaviors Among Youth Experiencing Homelessness Using a Machine Learning--Based Framework: Development and Evaluation Study", journal="JMIR AI", year="2024", month="Oct", day="17", volume="3", pages="e53488", keywords="machine learning", keywords="youth experiencing homelessness", keywords="natural language processing", keywords="infodemiology", keywords="social good", keywords="digital intervention", abstract="Background: Youth experiencing homelessness face substance use problems disproportionately compared to other youth. A study found that 69\% of youth experiencing homelessness meet the criteria for dependence on at least 1 substance, compared to 1.8\% for all US adolescents. In addition, they experience major structural and social inequalities, which further undermine their ability to receive the care they need. Objective: The goal of this study was to develop a machine learning--based framework that uses the social media content (posts and interactions) of youth experiencing homelessness to predict their substance use behaviors (ie, the probability of using marijuana). With this framework, social workers and care providers can identify and reach out to youth experiencing homelessness who are at a higher risk of substance use. 
Methods: We recruited 133 young people experiencing homelessness at a nonprofit organization located in a city in the western United States. After obtaining their consent, we collected the participants' social media conversations for the past year before they were recruited, and we asked the participants to complete a survey on their demographic information, health conditions, sexual behaviors, and substance use behaviors. Building on the social sharing of emotions theory and social support theory, we identified important features that can potentially predict substance use. Then, we used natural language processing techniques to extract such features from social media conversations and reactions and built a series of machine learning models to predict participants' marijuana use. Results: We evaluated our models based on their predictive performance as well as their conformity with measures of fairness. Without predictive features from survey information, which may introduce sex and racial biases, our machine learning models can reach an area under the curve of 0.72 and an accuracy of 0.81 using only social media data when predicting marijuana use. We also evaluated the false-positive rate for each sex and age segment. Conclusions: We showed that textual interactions among youth experiencing homelessness and their friends on social media can serve as a powerful resource to predict their substance use. The framework we developed allows care providers to allocate resources efficiently to youth experiencing homelessness in the greatest need while costing minimal overhead. It can be extended to analyze and predict other health-related behaviors and conditions observed in this vulnerable community. ", doi="10.2196/53488", url="https://ai.jmir.org/2024/1/e53488" } @Article{info:doi/10.2196/52974, author="Harrison, M. Rachel and Lapteva, Ekaterina and Bibin, Anton", title="Behavioral Nudging With Generative AI for Content Development in SMS Health Care Interventions: Case Study", journal="JMIR AI", year="2024", month="Oct", day="15", volume="3", pages="e52974", keywords="generative artificial intelligence", keywords="generative AI", keywords="prompt engineering", keywords="large language models", keywords="GPT", keywords="content design", keywords="brief message interventions", keywords="mHealth", keywords="behavior change techniques", keywords="medication adherence", keywords="type 2 diabetes", abstract="Background: Brief message interventions have demonstrated immense promise in health care, yet the development of these messages has suffered from a dearth of transparency and a scarcity of publicly accessible data sets. Moreover, the researcher-driven content creation process has raised resource allocation issues, necessitating a more efficient and transparent approach to content development. Objective: This research sets out to address the challenges of content development for SMS interventions by showcasing the use of generative artificial intelligence (AI) as a tool for content creation, transparently explaining the prompt design and content generation process, and providing the largest publicly available data set of brief messages and source code for future replication of our process. Methods: Leveraging the pretrained large language model GPT-3.5 (OpenAI), we generate a collection of messages in the context of medication adherence for individuals with type 2 diabetes using evidence-derived behavior change techniques identified in a prior systematic review. 
We create an attributed prompt designed to adhere to content (readability and tone) and SMS (character count and encoder type) standards while encouraging message variability to reflect differences in behavior change techniques. Results: We deliver the most extensive repository of brief messages for a singular health care intervention and the first library of messages crafted with generative AI. In total, our method yields a data set comprising 1150 messages, with 89.91\% (n=1034) meeting character length requirements and 80.7\% (n=928) meeting readability requirements. Furthermore, our analysis reveals that all messages exhibit diversity comparable to an existing publicly available data set created under the same theoretical framework for a similar setting. Conclusions: This research provides a novel approach to content creation for health care interventions using state-of-the-art generative AI tools. Future research is needed to assess the generated content for ethical, safety, and research standards, as well as to determine whether the intervention is successful in improving the target behaviors. ", doi="10.2196/52974", url="https://ai.jmir.org/2024/1/e52974", url="http://www.ncbi.nlm.nih.gov/pubmed/39405108" } @Article{info:doi/10.2196/60589, author="Cross, Shane and Bell, Imogen and Nicholas, Jennifer and Valentine, Lee and Mangelsdorf, Shaminka and Baker, Simon and Titov, Nick and Alvarez-Jimenez, Mario", title="Use of AI in Mental Health Care: Community and Mental Health Professionals Survey", journal="JMIR Ment Health", year="2024", month="Oct", day="11", volume="11", pages="e60589", keywords="mental health", keywords="health care", keywords="AI", keywords="community members", keywords="mental health professional", keywords="web-based survey", keywords="Australia", keywords="descriptive statistic", keywords="thematic analysis", keywords="cost reduction", keywords="data security", keywords="digital health", keywords="digital intervention", keywords="artificial intelligence", abstract="Background: Artificial intelligence (AI) has been increasingly recognized as a potential solution to address mental health service challenges by automating tasks and providing new forms of support. Objective: This study is the first in a series which aims to estimate the current rates of AI technology use as well as perceived benefits, harms, and risks experienced by community members (CMs) and mental health professionals (MHPs). Methods: This study involved 2 web-based surveys conducted in Australia. The surveys collected data on demographics, technology comfort, attitudes toward AI, specific AI use cases, and experiences of benefits and harms from AI use. Descriptive statistics were calculated, and thematic analysis of open-ended responses were conducted. Results: The final sample consisted of 107 CMs and 86 MHPs. General attitudes toward AI varied, with CMs reporting neutral and MHPs reporting more positive attitudes. Regarding AI usage, 28\% (30/108) of CMs used AI, primarily for quick support (18/30, 60\%) and as a personal therapist (14/30, 47\%). Among MHPs, 43\% (37/86) used AI; mostly for research (24/37, 65\%) and report writing (20/37, 54\%). While the majority found AI to be generally beneficial (23/30, 77\% of CMs and 34/37, 92\% of MHPs), specific harms and concerns were experienced by 47\% (14/30) of CMs and 51\% (19/37) of MHPs. There was an equal mix of positive and negative sentiment toward the future of AI in mental health care in open feedback. 
Conclusions: Commercial AI tools are increasingly being used by CMs and MHPs. Respondents believe AI will offer future advantages for mental health care in terms of accessibility, cost reduction, personalization, and work efficiency. However, they were equally concerned about reducing human connection, ethics, privacy and regulation, medical errors, potential for misuse, and data security. Despite the immense potential, integration into mental health systems must be approached with caution, addressing legal and ethical concerns while developing safeguards to mitigate potential harms. Future surveys are planned to track use and acceptability of AI and associated issues over time. ", doi="10.2196/60589", url="https://mental.jmir.org/2024/1/e60589" } @Article{info:doi/10.2196/49546, author="Agmon, Shunit and Singer, Uriel and Radinsky, Kira", title="Leveraging Temporal Trends for Training Contextual Word Embeddings to Address Bias in Biomedical Applications: Development Study", journal="JMIR AI", year="2024", month="Oct", day="2", volume="3", pages="e49546", keywords="natural language processing", keywords="NLP", keywords="BERT", keywords="word embeddings", keywords="statistical models", keywords="bias", keywords="algorithms", keywords="gender", abstract="Background: Women have been underrepresented in clinical trials for many years. Machine-learning models trained on clinical trial abstracts may capture and amplify biases in the data. Specifically, word embeddings are models that enable representing words as vectors and are the building block of most natural language processing systems. If word embeddings are trained on clinical trial abstracts, predictive models that use the embeddings will exhibit gender performance gaps. Objective: We aim to capture temporal trends in clinical trials through temporal distribution matching on contextual word embeddings (specifically, BERT) and explore its effect on the bias manifested in downstream tasks. Methods: We present TeDi-BERT, a method to harness the temporal trend of increasing women's inclusion in clinical trials to train contextual word embeddings. We implement temporal distribution matching through an adversarial classifier, trying to distinguish old from new clinical trial abstracts based on their embeddings. The temporal distribution matching acts as a form of domain adaptation from older to more recent clinical trials. We evaluate our model on 2 clinical tasks: prediction of unplanned readmission to the intensive care unit and hospital length of stay prediction. We also conduct an algorithmic analysis of the proposed method. Results: In readmission prediction, TeDi-BERT achieved area under the receiver operating characteristic curve of 0.64 for female patients versus the baseline of 0.62 (P<.001), and 0.66 for male patients versus the baseline of 0.64 (P<.001). In the length of stay regression, TeDi-BERT achieved a mean absolute error of 4.56 (95\% CI 4.44-4.68) for female patients versus 4.62 (95\% CI 4.50-4.74, P<.001) and 4.54 (95\% CI 4.44-4.65) for male patients versus 4.6 (95\% CI 4.50-4.71, P<.001). Conclusions: In both clinical tasks, TeDi-BERT improved performance for female patients, as expected; but it also improved performance for male patients. Our results show that accuracy for one gender does not need to be exchanged for bias reduction, but rather that good science improves clinical results for all. 
Contextual word embedding models trained to capture temporal trends can help mitigate the effects of bias that changes over time in the training data. ", doi="10.2196/49546", url="https://ai.jmir.org/2024/1/e49546" } @Article{info:doi/10.2196/60020, author="van Buchem, Meija Marieke and Kant, J. Ilse M. and King, Liza and Kazmaier, Jacqueline and Steyerberg, W. Ewout and Bauer, P. Martijn", title="Impact of a Digital Scribe System on Clinical Documentation Time and Quality: Usability Study", journal="JMIR AI", year="2024", month="Sep", day="23", volume="3", pages="e60020", keywords="large language model", keywords="large language models", keywords="LLM", keywords="LLMs", keywords="natural language processing", keywords="NLP", keywords="deep learning", keywords="pilot study", keywords="pilot studies", keywords="implementation", keywords="machine learning", keywords="ML", keywords="artificial intelligence", keywords="AI", keywords="algorithm", keywords="algorithms", keywords="model", keywords="models", keywords="analytics", keywords="practical model", keywords="practical models", keywords="automation", keywords="automate", keywords="documentation", keywords="documentation time", keywords="documentation quality", keywords="clinical documentation", abstract="Background: Physicians spend approximately half of their time on administrative tasks, which is one of the leading causes of physician burnout and decreased work satisfaction. The implementation of natural language processing--assisted clinical documentation tools may provide a solution. Objective: This study investigates the impact of a commercially available Dutch digital scribe system on clinical documentation efficiency and quality. Methods: Medical students with experience in clinical practice and documentation (n=22) created a total of 430 summaries of mock consultations and recorded the time they spent on this task. The consultations were summarized using 3 methods: manual summaries, fully automated summaries, and automated summaries with manual editing. We then randomly reassigned the summaries and evaluated their quality using a modified version of the Physician Documentation Quality Instrument (PDQI-9). We compared the differences between the 3 methods in descriptive statistics, quantitative text metrics (word count and lexical diversity), the PDQI-9, Recall-Oriented Understudy for Gisting Evaluation scores, and BERTScore. Results: The median time for manual summarization was 202 seconds against 186 seconds for editing an automatic summary. Without editing, the automatic summaries attained a poorer PDQI-9 score than manual summaries (median PDQI-9 score 25 vs 31, P<.001, ANOVA test). Automatic summaries were found to have higher word counts but lower lexical diversity than manual summaries (P<.001, independent t test). The study revealed variable impacts on PDQI-9 scores and summarization time across individuals. Generally, students viewed the digital scribe system as a potentially useful tool, noting its ease of use and time-saving potential, though some criticized the summaries for their greater length and rigid structure. Conclusions: This study highlights the potential of digital scribes in improving clinical documentation processes by offering a first summary draft for physicians to edit, thereby reducing documentation time without compromising the quality of patient records. 
Furthermore, digital scribes may be more beneficial to some physicians than to others and could play a role in improving the reusability of clinical documentation. Future studies should focus on the impact and quality of such a system when used by physicians in clinical practice. ", doi="10.2196/60020", url="https://ai.jmir.org/2024/1/e60020", url="http://www.ncbi.nlm.nih.gov/pubmed/39312397" } @Article{info:doi/10.2196/48588, author="Tao, Jinxin and Larson, G. Ramsey and Mintz, Yonatan and Alagoz, Oguzhan and Hoppe, K. Kara", title="Predictive Modeling of Hypertension-Related Postpartum Readmission: Retrospective Cohort Analysis", journal="JMIR AI", year="2024", month="Sep", day="13", volume="3", pages="e48588", keywords="pregnancy", keywords="postpartum", keywords="hypertension", keywords="preeclampsia", keywords="blood pressure", keywords="hospital readmission", keywords="clinical calculator", keywords="healthcare cost", keywords="cost", keywords="cohort analysis", keywords="utilization", keywords="resources", keywords="labor", keywords="women", keywords="risk", keywords="readmission", keywords="cohort", keywords="hospital", keywords="statistical model", keywords="retrospective cohort study", keywords="predict", abstract="Background: Hypertension is the most common reason for postpartum hospital readmission. Better prediction of postpartum readmission will improve the health care of patients. These models will allow better use of resources and decrease health care costs. Objective: This study aimed to evaluate clinical predictors of postpartum readmission for hypertension using a novel machine learning (ML) model that can effectively predict readmissions and balance treatment costs. We examined whether blood pressure and other measures during labor, not just postpartum measures, would be important predictors of readmission. Methods: We conducted a retrospective cohort study from the PeriData website data set from a single midwestern academic center of all women who delivered from 2009 to 2018. This study consists of 2 data sets; 1 spanning the years 2009-2015 and the other spanning the years 2016-2018. A total of 47 clinical and demographic variables were collected including blood pressure measurements during labor and post partum, laboratory values, and medication administration. Hospital readmissions were verified by patient chart review. In total, 32,645 patients were considered in the study. For our analysis, we trained several cost-sensitive ML models to predict the primary outcome of hypertension-related postpartum readmission within 42 days post partum. Models were evaluated using cross-validation and on independent data sets (models trained on data from 2009 to 2015 were validated on the data from 2016 to 2018). To assess clinical viability, a cost analysis of the models was performed to see how their recommendations could affect treatment costs. Results: Of the 32,645 patients included in the study, 170 were readmitted due to a hypertension-related diagnosis. A cost-sensitive random forest method was found to be the most effective with a balanced accuracy of 76.61\% for predicting readmission. Using a feature importance and area under the curve analysis, the most important variables for predicting readmission were blood pressures in labor and 24-48 hours post partum increasing the area under the curve of the model from 0.69 (SD 0.06) to 0.81 (SD 0.06), (P=.05). 
Cost analysis showed that the resulting model could have reduced associated readmission costs by US \$6000 against comparable models with similar F1-score and balanced accuracy. The most effective model was then implemented as a risk calculator that is publicly available. The code for this calculator and the model is also publicly available at a GitHub repository. Conclusions: Blood pressure measurements during labor through 48 hours post partum can be combined with other variables to predict women at risk for postpartum readmission. Using ML techniques in conjunction with these data have the potential to improve health outcomes and reduce associated costs. The use of the calculator can greatly assist clinicians in providing care to patients and improve medical decision-making. ", doi="10.2196/48588", url="https://ai.jmir.org/2024/1/e48588" } @Article{info:doi/10.2196/54449, author="Khademi, Sedigh and Palmer, Christopher and Javed, Muhammad and Dimaguila, Luis Gerardo and Clothier, Hazel and Buttery, Jim and Black, Jim", title="Near Real-Time Syndromic Surveillance of Emergency Department Triage Texts Using Natural Language Processing: Case Study in Febrile Convulsion Detection", journal="JMIR AI", year="2024", month="Aug", day="30", volume="3", pages="e54449", keywords="vaccine safety", keywords="immunization", keywords="febrile convulsion", keywords="syndromic surveillance", keywords="emergency department", keywords="natural language processing", abstract="Background: Collecting information on adverse events following immunization from as many sources as possible is critical for promptly identifying potential safety concerns and taking appropriate actions. Febrile convulsions are recognized as an important potential reaction to vaccination in children aged <6 years. Objective: The primary aim of this study was to evaluate the performance of natural language processing techniques and machine learning (ML) models for the rapid detection of febrile convulsion presentations in emergency departments (EDs), especially with respect to the minimum training data requirements to obtain optimum model performance. In addition, we examined the deployment requirements for a ML model to perform real-time monitoring of ED triage notes. Methods: We developed a pattern matching approach as a baseline and evaluated ML models for the classification of febrile convulsions in ED triage notes to determine both their training requirements and their effectiveness in detecting febrile convulsions. We measured their performance during training and then compared the deployed models' result on new incoming ED data. Results: Although the best standard neural networks had acceptable performance and were low-resource models, transformer-based models outperformed them substantially, justifying their ongoing deployment. Conclusions: Using natural language processing, particularly with the use of large language models, offers significant advantages in syndromic surveillance. Large language models make highly effective classifiers, and their text generation capacity can be used to enhance the quality and diversity of training data. ", doi="10.2196/54449", url="https://ai.jmir.org/2024/1/e54449" } @Article{info:doi/10.2196/57983, author="Ojha, Tanvi and Patel, Atushi and Sivapragasam, Krishihan and Sharma, Radha and Vosoughi, Tina and Skidmore, Becky and Pinto, D. 
Andrew and Hosseini, Banafshe", title="Exploring Machine Learning Applications in Pediatric Asthma Management: Scoping Review", journal="JMIR AI", year="2024", month="Aug", day="27", volume="3", pages="e57983", keywords="pediatric asthma", keywords="machine learning", keywords="predictive modeling", keywords="asthma management", keywords="exacerbation", keywords="artificial intelligence", abstract="Background: The integration of machine learning (ML) in predicting asthma-related outcomes in children presents a novel approach in pediatric health care. Objective: This scoping review aims to analyze studies published since 2019, focusing on ML algorithms, their applications, and predictive performances. Methods: We searched Ovid MEDLINE ALL and Embase on Ovid, the Cochrane Library (Wiley), CINAHL (EBSCO), and Web of Science (core collection). The search covered the period from January 1, 2019, to July 18, 2023. Studies applying ML models in predicting asthma-related outcomes in children aged <18 years were included. Covidence was used for citation management, and the risk of bias was assessed using the Prediction Model Risk of Bias Assessment Tool. Results: From 1231 initial articles, 15 met our inclusion criteria. The sample size ranged from 74 to 87,413 patients. Most studies used multiple ML techniques, with logistic regression (n=7, 47\%) and random forests (n=6, 40\%) being the most common. Key outcomes included predicting asthma exacerbations, classifying asthma phenotypes, predicting asthma diagnoses, and identifying potential risk factors. For predicting exacerbations, recurrent neural networks and XGBoost showed high performance, with XGBoost achieving an area under the receiver operating characteristic curve (AUROC) of 0.76. In classifying asthma phenotypes, support vector machines were highly effective, achieving an AUROC of 0.79. For diagnosis prediction, artificial neural networks outperformed logistic regression, with an AUROC of 0.63. To identify risk factors focused on symptom severity and lung function, random forests achieved an AUROC of 0.88. Sound-based studies distinguished wheezing from nonwheezing and asthmatic from normal coughs. The risk of bias assessment revealed that most studies (n=8, 53\%) exhibited low to moderate risk, ensuring a reasonable level of confidence in the findings. Common limitations across studies included data quality issues, sample size constraints, and interpretability concerns. Conclusions: This review highlights the diverse application of ML in predicting pediatric asthma outcomes, with each model offering unique strengths and challenges. Future research should address data quality, increase sample sizes, and enhance model interpretability to optimize ML utility in clinical settings for pediatric asthma management. 
", doi="10.2196/57983", url="https://ai.jmir.org/2024/1/e57983", url="http://www.ncbi.nlm.nih.gov/pubmed/39190449" } @Article{info:doi/10.2196/53506, author="Lange, Martin and L{\"o}we, Alexandra and Kayser, Ina and Schaller, Andrea", title="Approaches for the Use of AI in Workplace Health Promotion and Prevention: Systematic Scoping Review", journal="JMIR AI", year="2024", month="Aug", day="20", volume="3", pages="e53506", keywords="artificial intelligence", keywords="AI", keywords="machine learning", keywords="deep learning", keywords="workplace health promotion", keywords="prevention", keywords="workplace health promotion and prevention", keywords="technology", keywords="technologies", keywords="well-being", keywords="behavioral health", keywords="workplace-related", keywords="public health", keywords="biomedicine", keywords="PRISMA-ScR", keywords="Preferred Reporting Items for Systematic Reviews and Meta-Analyses Extension for Scoping Reviews", keywords="WHPP", keywords="risk", keywords="AI-algorithm", keywords="control group", keywords="accuracy", keywords="health-related", keywords="prototype", keywords="systematic review", keywords="scoping review", keywords="reviews", keywords="mobile phone", abstract="Background: Artificial intelligence (AI) is an umbrella term for various algorithms and rapidly emerging technologies with huge potential for workplace health promotion and prevention (WHPP). WHPP interventions aim to improve people's health and well-being through behavioral and organizational measures or by minimizing the burden of workplace-related diseases and associated risk factors. While AI has been the focus of research in other health-related fields, such as public health or biomedicine, the transition of AI into WHPP research has yet to be systematically investigated. Objective: The systematic scoping review aims to comprehensively assess an overview of the current use of AI in WHPP. The results will be then used to point to future research directions. The following research questions were derived: (1) What are the study characteristics of studies on AI algorithms and technologies in the context of WHPP? (2) What specific WHPP fields (prevention, behavioral, and organizational approaches) were addressed by the AI algorithms and technologies? (3) What kind of interventions lead to which outcomes? Methods: A systematic scoping literature review (PRISMA-ScR [Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews]) was conducted in the 3 academic databases PubMed, Institute of Electrical and Electronics Engineers, and Association for Computing Machinery in July 2023, searching for papers published between January 2000 and December 2023. Studies needed to be (1) peer-reviewed, (2) written in English, and (3) focused on any AI-based algorithm or technology that (4) were conducted in the context of WHPP or (5) an associated field. Information on study design, AI algorithms and technologies, WHPP fields, and the patient or population, intervention, comparison, and outcomes framework were extracted blindly with Rayyan and summarized. Results: A total of 10 studies were included. Risk prevention and modeling were the most identified WHPP fields (n=6), followed by behavioral health promotion (n=4) and organizational health promotion (n=1). Further, 4 studies focused on mental health. Most AI algorithms were machine learning-based, and 3 studies used combined deep learning algorithms. 
AI algorithms and technologies were primarily implemented in smartphone apps (eg, in the form of a chatbot) or used the smartphone as a data source (eg, Global Positioning System). Behavioral approaches ranged from 8 to 12 weeks and were compared to control groups. Additionally, 3 studies evaluated the robustness and accuracy of an AI model or framework. Conclusions: Although AI has caught increasing attention in health-related research, the review reveals that AI in WHPP is marginally investigated. Our results indicate that AI is promising for individualization and risk prediction in WHPP, but current research does not cover the scope of WHPP. Beyond that, future research will profit from an extended range of research in all fields of WHPP, longitudinal data, and reporting guidelines. Trial Registration: OSF Registries osf.io/bfswp; https://osf.io/bfswp ", doi="10.2196/53506", url="https://ai.jmir.org/2024/1/e53506", url="http://www.ncbi.nlm.nih.gov/pubmed/38989904" } @Article{info:doi/10.2196/56537, author="Mostafapour, Mehrnaz and Fortier, H. Jacqueline and Pacheco, Karen and Murray, Heather and Garber, Gary", title="Evaluating Literature Reviews Conducted by Humans Versus ChatGPT: Comparative Study", journal="JMIR AI", year="2024", month="Aug", day="19", volume="3", pages="e56537", keywords="OpenAIs", keywords="chatGPT", keywords="AI vs. human", keywords="literature search", keywords="Chat GPT performance evaluation", keywords="large language models", keywords="artificial intelligence", keywords="AI", keywords="algorithm", keywords="algorithms", keywords="predictive model", keywords="predictive models", keywords="literature review", keywords="literature reviews", abstract="Background: With the rapid evolution of artificial intelligence (AI), particularly large language models (LLMs) such as ChatGPT-4 (OpenAI), there is an increasing interest in their potential to assist in scholarly tasks, including conducting literature reviews. However, the efficacy of AI-generated reviews compared with traditional human-led approaches remains underexplored. Objective: This study aims to compare the quality of literature reviews conducted by the ChatGPT-4 model with those conducted by human researchers, focusing on the relational dynamics between physicians and patients. Methods: We included 2 literature reviews in the study on the same topic, namely, exploring factors affecting relational dynamics between physicians and patients in medicolegal contexts. One review used GPT-4, last updated in September 2021, and the other was conducted by human researchers. The human review involved a comprehensive literature search using medical subject headings and keywords in Ovid MEDLINE, followed by a thematic analysis of the literature to synthesize information from selected articles. The AI-generated review used a new prompt engineering approach, using iterative and sequential prompts to generate results. Comparative analysis was based on qualitative measures such as accuracy, response time, consistency, breadth and depth of knowledge, contextual understanding, and transparency. Results: GPT-4 produced an extensive list of relational factors rapidly. The AI model demonstrated an impressive breadth of knowledge but exhibited limitations in in-depth and contextual understanding, occasionally producing irrelevant or incorrect information. In comparison, human researchers provided a more nuanced and contextually relevant review. 
The comparative analysis assessed the reviews based on criteria including accuracy, response time, consistency, breadth and depth of knowledge, contextual understanding, and transparency. While GPT-4 showed advantages in response time and breadth of knowledge, human-led reviews excelled in accuracy, depth of knowledge, and contextual understanding. Conclusions: The study suggests that GPT-4, with structured prompt engineering, can be a valuable tool for conducting preliminary literature reviews by providing a broad overview of topics quickly. However, its limitations necessitate careful expert evaluation and refinement, making it an assistant rather than a substitute for human expertise in comprehensive literature reviews. Moreover, this research highlights the potential and limitations of using AI tools like GPT-4 in academic research, particularly in the fields of health services and medical research. It underscores the necessity of combining AI's rapid information retrieval capabilities with human expertise for more accurate and contextually rich scholarly outputs. ", doi="10.2196/56537", url="https://ai.jmir.org/2024/1/e56537" } @Article{info:doi/10.2196/49795, author="Lorenzini, Giorgia and Arbelaez Ossa, Laura and Milford, Stephen and Elger, Simone Bernice and Shaw, Martin David and De Clercq, Eva", title="The ``Magical Theory'' of AI in Medicine: Thematic Narrative Analysis", journal="JMIR AI", year="2024", month="Aug", day="19", volume="3", pages="e49795", keywords="artificial intelligence", keywords="medicine", keywords="physicians", keywords="hype", keywords="narratives", keywords="qualitative research", abstract="Background: The discourse surrounding medical artificial intelligence (AI) often focuses on narratives that either hype the technology's potential or predict dystopian futures. AI narratives have a significant influence on the direction of research, funding, and public opinion and thus shape the future of medicine. Objective: The paper aims to offer critical reflections on AI narratives, with a specific focus on medical AI, and to raise awareness as to how people working with medical AI talk about AI and discharge their ``narrative responsibility.'' Methods: Qualitative semistructured interviews were conducted with 41 participants from different disciplines who were exposed to medical AI in their profession. The research represents a secondary analysis of data using a thematic narrative approach. The analysis resulted in 2 main themes, each with 2 other subthemes. Results: Stories about the AI-physician interaction depicted either a competitive or collaborative relationship. Some participants argued that AI might replace physicians, as it performs better than physicians. However, others believed that physicians should not be replaced and that AI should rather assist and support physicians. The idea of excessive technological deferral and automation bias was discussed, highlighting the risk of ``losing'' decisional power. The possibility that AI could relieve physicians from burnout and allow them to spend more time with patients was also considered. Finally, a few participants reported an extremely optimistic account of medical AI, while the majority criticized this type of story. The latter lamented the existence of a ``magical theory'' of medical AI, identified with techno-solutionist positions. Conclusions: Most of the participants reported a nuanced view of technology, recognizing both its benefits and challenges and avoiding polarized narratives. 
However, some participants did contribute to the hype surrounding medical AI, comparing it to human capabilities and depicting it as superior. Overall, the majority agreed that medical AI should assist rather than replace clinicians. The study concludes that a balanced narrative (that focuses on the technology's present capabilities and limitations) is necessary to fully realize the potential of medical AI while avoiding unrealistic expectations and hype. ", doi="10.2196/49795", url="https://ai.jmir.org/2024/1/e49795", url="http://www.ncbi.nlm.nih.gov/pubmed/39158953" } @Article{info:doi/10.2196/54371, author="Spina, Aidin and Andalib, Saman and Flores, Daniel and Vermani, Rishi and Halaseh, F. Faris and Nelson, M. Ariana", title="Evaluation of Generative Language Models in Personalizing Medical Information: Instrument Validation Study", journal="JMIR AI", year="2024", month="Aug", day="13", volume="3", pages="e54371", keywords="generative language model", keywords="GLM", keywords="artificial intelligence", keywords="AI", keywords="low health literacy", keywords="LHL", keywords="readability", keywords="GLMs", keywords="language model", keywords="language models", keywords="health literacy", keywords="understandable", keywords="understandability", keywords="knowledge translation", keywords="comprehension", keywords="generative", keywords="NLP", keywords="natural language processing", keywords="reading level", keywords="reading levels", keywords="education", keywords="medical text", keywords="medical texts", keywords="medical information", keywords="health information", abstract="Background: Although uncertainties exist regarding implementation, artificial intelligence--driven generative language models (GLMs) have enormous potential in medicine. Deployment of GLMs could improve patient comprehension of clinical texts and improve low health literacy. Objective: The goal of this study is to evaluate the potential of ChatGPT-3.5 and GPT-4 to tailor the complexity of medical information to patient-specific input education level, which is crucial if it is to serve as a tool in addressing low health literacy. Methods: Input templates related to 2 prevalent chronic diseases---type II diabetes and hypertension---were designed. Each clinical vignette was adjusted for hypothetical patient education levels to evaluate output personalization. To assess the success of a GLM (GPT-3.5 and GPT-4) in tailoring output writing, the readability of pre- and posttransformation outputs were quantified using the Flesch reading ease score (FKRE) and the Flesch-Kincaid grade level (FKGL). Results: Responses (n=80) were generated using GPT-3.5 and GPT-4 across 2 clinical vignettes. For GPT-3.5, FKRE means were 57.75 (SD 4.75), 51.28 (SD 5.14), 32.28 (SD 4.52), and 28.31 (SD 5.22) for 6th grade, 8th grade, high school, and bachelor's, respectively; FKGL mean scores were 9.08 (SD 0.90), 10.27 (SD 1.06), 13.4 (SD 0.80), and 13.74 (SD 1.18). GPT-3.5 only aligned with the prespecified education levels at the bachelor's degree. Conversely, GPT-4's FKRE mean scores were 74.54 (SD 2.6), 71.25 (SD 4.96), 47.61 (SD 6.13), and 13.71 (SD 5.77), with FKGL mean scores of 6.3 (SD 0.73), 6.7 (SD 1.11), 11.09 (SD 1.26), and 17.03 (SD 1.11) for the same respective education levels. GPT-4 met the target readability for all groups except the 6th-grade FKRE average. 
Both GLMs produced outputs with statistically significant differences (FKRE: 6th grade P<.001; 8th grade P<.001; high school P<.001; bachelors P=.003; FKGL: 6th grade P=.001; 8th grade P<.001; high school P<.001; bachelors P<.001) between mean FKRE and FKGL across input education levels. Conclusions: GLMs can change the structure and readability of medical text outputs according to input-specified education. However, GLMs categorize input education designation into 3 broad tiers of output readability: easy (6th and 8th grade), medium (high school), and difficult (bachelor's degree). This is the first result to suggest that there are broader boundaries in the success of GLMs in output text simplification. Future research must establish how GLMs can reliably personalize medical texts to prespecified education levels to enable a broader impact on health care literacy. ", doi="10.2196/54371", url="https://ai.jmir.org/2024/1/e54371" } @Article{info:doi/10.2196/56932, author="Lu, Qiuhao and Wen, Andrew and Nguyen, Thien and Liu, Hongfang", title="Enhancing Clinical Relevance of Pretrained Language Models Through Integration of External Knowledge: Case Study on Cardiovascular Diagnosis From Electronic Health Records", journal="JMIR AI", year="2024", month="Aug", day="6", volume="3", pages="e56932", keywords="knowledge integration", keywords="pre-trained language models", keywords="physician reasoning", keywords="adapters", keywords="physician", keywords="physicians", keywords="electronic health record", keywords="electronic health records", keywords="EHR", keywords="healthcare", keywords="heterogeneous", keywords="healthcare institution", keywords="healthcare institutions", keywords="proprietary information", keywords="healthcare data", keywords="methodology", keywords="text classification", keywords="data privacy", keywords="medical knowledge", abstract="Background: Despite their growing use in health care, pretrained language models (PLMs) often lack clinical relevance due to insufficient domain expertise and poor interpretability. A key strategy to overcome these challenges is integrating external knowledge into PLMs, enhancing their adaptability and clinical usefulness. Current biomedical knowledge graphs like UMLS (Unified Medical Language System), SNOMED CT (Systematized Medical Nomenclature for Medicine--Clinical Terminology), and HPO (Human Phenotype Ontology), while comprehensive, fail to effectively connect general biomedical knowledge with physician insights. There is an equally important need for a model that integrates diverse knowledge in a way that is both unified and compartmentalized. This approach not only addresses the heterogeneous nature of domain knowledge but also recognizes the unique data and knowledge repositories of individual health care institutions, necessitating careful and respectful management of proprietary information. Objective: This study aimed to enhance the clinical relevance and interpretability of PLMs by integrating external knowledge in a manner that respects the diversity and proprietary nature of health care data. We hypothesize that domain knowledge, when captured and distributed as stand-alone modules, can be effectively reintegrated into PLMs to significantly improve their adaptability and utility in clinical settings. 
Methods: We demonstrate that through adapters, small and lightweight neural networks that enable the integration of extra information without full model fine-tuning, we can inject diverse sources of external domain knowledge into language models and improve the overall performance with an increased level of interpretability. As a practical application of this methodology, we introduce a novel task, structured as a case study, that endeavors to capture physician knowledge in assigning cardiovascular diagnoses from clinical narratives, where we extract diagnosis-comment pairs from electronic health records (EHRs) and cast the problem as text classification. Results: The study demonstrates that integrating domain knowledge into PLMs significantly improves their performance. While improvements with ClinicalBERT are more modest, likely due to its pretraining on clinical texts, BERT (bidirectional encoder representations from transformer) equipped with knowledge adapters surprisingly matches or exceeds ClinicalBERT in several metrics. This underscores the effectiveness of knowledge adapters and highlights their potential in settings with strict data privacy constraints. This approach also increases the level of interpretability of these models in a clinical context, which enhances our ability to precisely identify and apply the most relevant domain knowledge for specific tasks, thereby optimizing the model's performance and tailoring it to meet specific clinical needs. Conclusions: This research provides a basis for creating health knowledge graphs infused with physician knowledge, marking a significant step forward for PLMs in health care. Notably, the model balances integrating knowledge both comprehensively and selectively, addressing the heterogeneous nature of medical knowledge and the privacy needs of health care institutions. ", doi="10.2196/56932", url="https://ai.jmir.org/2024/1/e56932", url="http://www.ncbi.nlm.nih.gov/pubmed/39106099" } @Article{info:doi/10.2196/54482, author="Prescott, R. Maximo and Yeager, Samantha and Ham, Lillian and Rivera Saldana, D. Carlos and Serrano, Vanessa and Narez, Joey and Paltin, Dafna and Delgado, Jorge and Moore, J. David and Montoya, Jessica", title="Comparing the Efficacy and Efficiency of Human and Generative AI: Qualitative Thematic Analyses", journal="JMIR AI", year="2024", month="Aug", day="2", volume="3", pages="e54482", keywords="GenAI", keywords="generative artificial intelligence", keywords="ChatGPT", keywords="Bard", keywords="qualitative research", keywords="thematic analysis", keywords="digital health", abstract="Background: Qualitative methods are incredibly beneficial to the dissemination and implementation of new digital health interventions; however, these methods can be time intensive and slow down dissemination when timely knowledge from the data sources is needed in ever-changing health systems. Recent advancements in generative artificial intelligence (GenAI) and their underlying large language models (LLMs) may provide a promising opportunity to expedite the qualitative analysis of textual data, but their efficacy and reliability remain unknown. Objective: The primary objectives of our study were to evaluate the consistency in themes, reliability of coding, and time needed for inductive and deductive thematic analyses between GenAI (ie, ChatGPT and Bard) and human coders. 
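To make the adapter idea in the study above (doi 10.2196/56932) concrete, the sketch below shows a generic bottleneck adapter of the kind commonly inserted into transformer layers. The layer sizes, activation, and placement are assumptions for illustration, not the authors' architecture.

```python
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    """Small residual bottleneck module; only its parameters are trained,
    while the surrounding pretrained language model stays frozen."""

    def __init__(self, hidden_size: int = 768, bottleneck: int = 64):
        super().__init__()
        self.down = nn.Linear(hidden_size, bottleneck)
        self.act = nn.GELU()
        self.up = nn.Linear(bottleneck, hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Residual connection keeps the pretrained representation intact
        return hidden_states + self.up(self.act(self.down(hidden_states)))

# Example: adapt the output of one (frozen) transformer layer
adapter = BottleneckAdapter()
layer_output = torch.randn(2, 16, 768)   # (batch, tokens, hidden)
print(adapter(layer_output).shape)       # torch.Size([2, 16, 768])
```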
Methods: The qualitative data for this study consisted of 40 brief SMS text message reminder prompts used in a digital health intervention for promoting antiretroviral medication adherence among people with HIV who use methamphetamine. Inductive and deductive thematic analyses of these SMS text messages were conducted by 2 independent teams of human coders. An independent human analyst conducted analyses following both approaches using ChatGPT and Bard. The consistency in themes (or the extent to which the themes were the same) and reliability (or agreement in coding of themes) between methods were compared. Results: The themes generated by GenAI (both ChatGPT and Bard) were consistent with 71\% (5/7) of the themes identified by human analysts following inductive thematic analysis. The consistency in themes was lower between humans and GenAI following a deductive thematic analysis procedure (ChatGPT: 6/12, 50\%; Bard: 7/12, 58\%). The percentage agreement (or intercoder reliability) for these congruent themes between human coders and GenAI ranged from fair to moderate (ChatGPT, inductive: 31/66, 47\%; ChatGPT, deductive: 22/59, 37\%; Bard, inductive: 20/54, 37\%; Bard, deductive: 21/58, 36\%). In general, ChatGPT and Bard performed similarly to each other across both types of qualitative analyses in terms of consistency of themes (inductive: 6/6, 100\%; deductive: 5/6, 83\%) and reliability of coding (inductive: 23/62, 37\%; deductive: 22/47, 47\%). On average, GenAI required significantly less overall time than human coders when conducting qualitative analysis (20, SD 3.5 min vs 567, SD 106.5 min). Conclusions: The promising consistency in the themes generated by human coders and GenAI suggests that these technologies hold promise in reducing the resource intensiveness of qualitative thematic analysis; however, the relatively lower reliability in coding between them suggests that hybrid approaches are necessary. Human coders appeared to be better than GenAI at identifying nuanced and interpretative themes. Future studies should consider how these powerful technologies can be best used in collaboration with human coders to improve the efficiency of qualitative research in hybrid approaches while also mitigating potential ethical risks that they may pose. ", doi="10.2196/54482", url="https://ai.jmir.org/2024/1/e54482" } @Article{info:doi/10.2196/52500, author="Hodson, Nathan and Williamson, Simon", title="Can Large Language Models Replace Therapists? 
Evaluating Performance at Simple Cognitive Behavioral Therapy Tasks", journal="JMIR AI", year="2024", month="Jul", day="30", volume="3", pages="e52500", keywords="mental health", keywords="psychotherapy", keywords="digital therapy", keywords="CBT", keywords="ChatGPT", keywords="cognitive behavioral therapy", keywords="cognitive behavioural therapy", keywords="LLM", keywords="LLMs", keywords="language model", keywords="language models", keywords="NLP", keywords="natural language processing", keywords="artificial intelligence", keywords="performance", keywords="chatbot", keywords="chatbots", keywords="conversational agent", keywords="conversational agents", doi="10.2196/52500", url="https://ai.jmir.org/2024/1/e52500", url="http://www.ncbi.nlm.nih.gov/pubmed/39078696" } @Article{info:doi/10.2196/46871, author="Han, Yu and Ceross, Aaron and Bergmann, Jeroen", title="Regulatory Frameworks for AI-Enabled Medical Device Software in China: Comparative Analysis and Review of Implications for Global Manufacturer", journal="JMIR AI", year="2024", month="Jul", day="29", volume="3", pages="e46871", keywords="NMPA", keywords="medical device software", keywords="device registration", keywords="registration pathway", keywords="artificial intelligence", keywords="machine learning", keywords="medical device", keywords="device development", keywords="China", keywords="regulations", keywords="medical software", doi="10.2196/46871", url="https://ai.jmir.org/2024/1/e46871", url="http://www.ncbi.nlm.nih.gov/pubmed/39073860" } @Article{info:doi/10.2196/54885, author="Huang, Jingyi and Guo, Peiqi and Zhang, Sheng and Ji, Mengmeng and An, Ruopeng", title="Use of Deep Neural Networks to Predict Obesity With Short Audio Recordings: Development and Usability Study", journal="JMIR AI", year="2024", month="Jul", day="25", volume="3", pages="e54885", keywords="obesity", keywords="obese", keywords="overweight", keywords="voice", keywords="vocal", keywords="vocal cord", keywords="vocal cords", keywords="voice-based", keywords="machine learning", keywords="ML", keywords="artificial intelligence", keywords="AI", keywords="algorithm", keywords="algorithms", keywords="predictive model", keywords="predictive models", keywords="predictive analytics", keywords="predictive system", keywords="practical model", keywords="practical models", keywords="early warning", keywords="early detection", keywords="deep neural network", keywords="deep neural networks", keywords="DNN", keywords="artificial neural network", keywords="artificial neural networks", keywords="deep learning", abstract="Background: The escalating global prevalence of obesity has necessitated the exploration of novel diagnostic approaches. Recent scientific inquiries have indicated potential alterations in voice characteristics associated with obesity, suggesting the feasibility of using voice as a noninvasive biomarker for obesity detection. Objective: This study aims to use deep neural networks to predict obesity status through the analysis of short audio recordings, investigating the relationship between vocal characteristics and obesity. Methods: A pilot study was conducted with 696 participants, using self-reported BMI to classify individuals into obesity and nonobesity groups. Audio recordings of participants reading a short script were transformed into spectrograms and analyzed using an adapted YOLOv8 model (Ultralytics). The model performance was evaluated using accuracy, recall, precision, and F1-scores. 
Results: The adapted YOLOv8 model demonstrated a global accuracy of 0.70 and a macro F1-score of 0.65. It was more effective in identifying nonobesity (F1-score of 0.77) than obesity (F1-score of 0.53). This moderate level of accuracy highlights the potential and challenges in using vocal biomarkers for obesity detection. Conclusions: While the study shows promise in the field of voice-based medical diagnostics for obesity, it faces limitations such as reliance on self-reported BMI data and a small, homogenous sample size. These factors, coupled with variability in recording quality, necessitate further research with more robust methodologies and diverse samples to enhance the validity of this novel approach. The findings lay a foundational step for future investigations in using voice as a noninvasive biomarker for obesity detection. ", doi="10.2196/54885", url="https://ai.jmir.org/2024/1/e54885" } @Article{info:doi/10.2196/56700, author="Kurasawa, Hisashi and Waki, Kayo and Seki, Tomohisa and Chiba, Akihiro and Fujino, Akinori and Hayashi, Katsuyoshi and Nakahara, Eri and Haga, Tsuneyuki and Noguchi, Takashi and Ohe, Kazuhiko", title="Enhancing Type 2 Diabetes Treatment Decisions With Interpretable Machine Learning Models for Predicting Hemoglobin A1c Changes: Machine Learning Model Development", journal="JMIR AI", year="2024", month="Jul", day="18", volume="3", pages="e56700", keywords="AI", keywords="artificial intelligence", keywords="attention weight", keywords="type 2 diabetes", keywords="blood glucose control", keywords="machine learning", keywords="transformer", abstract="Background: Type 2 diabetes (T2D) is a significant global health challenge. Physicians need to assess whether future glycemic control will be poor on the current trajectory of usual care and usual-care treatment intensifications so that they can consider taking extra treatment measures to prevent poor outcomes. Predicting poor glycemic control from trends in hemoglobin A1c (HbA1c) levels is difficult due to the influence of seasonal fluctuations and other factors. Objective: We sought to develop a model that accurately predicts poor glycemic control among patients with T2D receiving usual care. Methods: Our machine learning model predicts poor glycemic control (HbA1c{$\geq$}8\%) using the transformer architecture, incorporating an attention mechanism to process irregularly spaced HbA1c time series and quantify temporal relationships of past HbA1c levels at each time point. We assessed the model using HbA1c levels from 7787 patients with T2D seeing specialist physicians at the University of Tokyo Hospital. The training data include instances of poor glycemic control occurring during usual care with usual-care treatment intensifications. We compared prediction accuracy, assessed with the area under the receiver operating characteristic curve, the area under the precision-recall curve, and the accuracy rate, to that of LightGBM. Results: The area under the receiver operating characteristic curve, the area under the precision-recall curve, and the accuracy rate (95\% confidence limits) of the proposed model were 0.925 (95\% CI 0.923-0.928), 0.864 (95\% CI 0.852-0.875), and 0.864 (95\% CI 0.86-0.869), respectively. The proposed model achieved high prediction accuracy comparable to or surpassing LightGBM's performance. The model prioritized the most recent HbA1c levels for predictions. 
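For the voice-based obesity study above (doi 10.2196/54885), the audio-to-spectrogram step might look roughly like the sketch below. The librosa parameters and the file name are assumptions; the authors' actual preprocessing and YOLOv8 adaptation are not reproduced here.

```python
import numpy as np
import librosa
import matplotlib.pyplot as plt

# Load a short recording (file name is a placeholder) and build a mel-spectrogram
y, sr = librosa.load("recording.wav", sr=16000)
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
mel_db = librosa.power_to_db(mel, ref=np.max)   # log scale, as typically fed to image classifiers

# Save the spectrogram as an image that a vision model can consume
plt.figure(figsize=(4, 4))
plt.axis("off")
plt.imshow(mel_db, origin="lower", aspect="auto", cmap="magma")
plt.savefig("recording_spectrogram.png", bbox_inches="tight", pad_inches=0)
plt.close()
```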
Older HbA1c levels in patients with poor glycemic control were slightly more influential in predictions compared to patients with good glycemic control. Conclusions: The proposed model accurately predicts poor glycemic control for patients with T2D receiving usual care, including patients receiving usual-care treatment intensifications, allowing physicians to identify cases warranting extraordinary treatment intensifications. If used by a nonspecialist, the model's indication of likely future poor glycemic control may warrant a referral to a specialist. Future efforts could incorporate diverse and large-scale clinical data for improved accuracy. ", doi="10.2196/56700", url="https://ai.jmir.org/2024/1/e56700", url="http://www.ncbi.nlm.nih.gov/pubmed/39024008" } @Article{info:doi/10.2196/54798, author="De Souza, Jessica and Viswanath, Kumar Varun and Echterhoff, Maria Jessica and Chamberlain, Kristina and Wang, Jay Edward", title="Augmenting Telepostpartum Care With Vision-Based Detection of Breastfeeding-Related Conditions: Algorithm Development and Validation", journal="JMIR AI", year="2024", month="Jun", day="24", volume="3", pages="e54798", keywords="remote consultations", keywords="artificial intelligence", keywords="AI for health care", keywords="deep learning", keywords="detection model", keywords="breastfeeding", keywords="telehealth", keywords="perinatal health", keywords="image analysis", keywords="women's health", keywords="mobile phone", abstract="Background: Breastfeeding benefits both the mother and infant and is a topic of attention in public health. After childbirth, untreated medical conditions or lack of support lead many mothers to discontinue breastfeeding. For instance, nipple damage and mastitis affect 80\% and 20\% of US mothers, respectively. Lactation consultants (LCs) help mothers with breastfeeding, providing in-person, remote, and hybrid lactation support. LCs guide, encourage, and find ways for mothers to have a better experience breastfeeding. Current telehealth services help mothers seek LCs for breastfeeding support, where images help them identify and address many issues. Due to the disproportional ratio of LCs and mothers in need, these professionals are often overloaded and burned out. Objective: This study aims to investigate the effectiveness of 5 distinct convolutional neural networks in detecting healthy lactating breasts and 6 breastfeeding-related issues by only using red, green, and blue images. Our goal was to assess the applicability of this algorithm as an auxiliary resource for LCs to identify painful breast conditions quickly, better manage their patients through triage, respond promptly to patient needs, and enhance the overall experience and care for breastfeeding mothers. Methods: We evaluated the potential for 5 classification models to detect breastfeeding-related conditions using 1078 breast and nipple images gathered from web-based and physical educational resources. We used the convolutional neural networks Resnet50, Visual Geometry Group model with 16 layers (VGG16), InceptionV3, EfficientNetV2, and DenseNet169 to classify the images across 7 classes: healthy, abscess, mastitis, nipple blebs, dermatosis, engorgement, and nipple damage by improper feeding or misuse of breast pumps. We also evaluated the models' ability to distinguish between healthy and unhealthy images. We present an analysis of the classification challenges, identifying image traits that may confound the detection model. 
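As a rough illustration of the attention-based approach in the HbA1c study above (doi 10.2196/56700), the sketch below embeds each (HbA1c value, days since previous test) pair and passes the sequence through a small transformer encoder. The dimensions, input features, and classification head are assumptions, not the published model.

```python
import torch
import torch.nn as nn

class HbA1cTransformer(nn.Module):
    """Toy classifier for irregularly spaced HbA1c series: each visit is embedded
    from (HbA1c value, days since previous test) so attention can weigh past values."""

    def __init__(self, d_model: int = 32, nhead: int = 4, num_layers: int = 2):
        super().__init__()
        self.embed = nn.Linear(2, d_model)
        layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, series: torch.Tensor) -> torch.Tensor:
        # series: (batch, visits, 2) -> probability of poor glycemic control
        h = self.encoder(self.embed(series))
        return torch.sigmoid(self.head(h[:, -1, :]))  # read off the most recent visit

# Two patients, five visits each: columns are (HbA1c %, days since previous test)
batch = torch.tensor([[[7.1, 0], [7.4, 90], [7.9, 60], [8.2, 120], [8.4, 90]],
                      [[6.5, 0], [6.4, 100], [6.6, 95], [6.5, 110], [6.7, 80]]])
print(HbA1cTransformer()(batch))  # untrained, so outputs are arbitrary probabilities
```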
Results: The best model achieves an average area under the receiver operating characteristic curve of 0.93 for all conditions after data augmentation for multiclass classification. For binary classification, we achieved, with the best model, an average area under the curve of 0.96 for all conditions after data augmentation. Several factors contributed to the misclassification of images, including similar visual features in the conditions that precede other conditions (such as the mastitis spectrum disorder), partially covered breasts or nipples, and images depicting multiple conditions in the same breast. Conclusions: This vision-based automated detection technique offers an opportunity to enhance postpartum care for mothers and can potentially help alleviate the workload of LCs by expediting decision-making processes. ", doi="10.2196/54798", url="https://ai.jmir.org/2024/1/e54798" } @Article{info:doi/10.2196/55957, author="Bragazzi, Luigi Nicola and Garbarino, Sergio", title="Toward Clinical Generative AI: Conceptual Framework", journal="JMIR AI", year="2024", month="Jun", day="7", volume="3", pages="e55957", keywords="clinical intelligence", keywords="artificial intelligence", keywords="iterative process", keywords="abduction", keywords="benchmarking", keywords="verification paradigms", doi="10.2196/55957", url="https://ai.jmir.org/2024/1/e55957", url="http://www.ncbi.nlm.nih.gov/pubmed/38875592" } @Article{info:doi/10.2196/54501, author="Jordan, Alexis and Park, Albert", title="Understanding the Long Haulers of COVID-19: Mixed Methods Analysis of YouTube Content", journal="JMIR AI", year="2024", month="Jun", day="3", volume="3", pages="e54501", keywords="long haulers", keywords="post--COVID-19 condition", keywords="COVID-19", keywords="YouTube", keywords="topic modeling", keywords="natural language processing", abstract="Background: The COVID-19 pandemic had a devastating global impact. In the United States, there were >98 million COVID-19 cases and >1 million resulting deaths. One consequence of COVID-19 infection has been post--COVID-19 condition (PCC). People with this syndrome, colloquially called long haulers, experience symptoms that impact their quality of life. The root cause of PCC and effective treatments remains unknown. Many long haulers have turned to social media for support and guidance. Objective: In this study, we sought to gain a better understanding of the long hauler experience by investigating what has been discussed and how information about long haulers is perceived on social media. We specifically investigated the following: (1) the range of symptoms that are discussed, (2) the ways in which information about long haulers is perceived, (3) informational and emotional support that is available to long haulers, and (4) discourse between viewers and creators. We selected YouTube as our data source due to its popularity and wide range of audience. Methods: We systematically gathered data from 3 different types of content creators: medical sources, news sources, and long haulers. To computationally understand the video content and viewers' reactions, we used Biterm, a topic modeling algorithm created specifically for short texts, to analyze snippets of video transcripts and all top-level comments from the comment section. To triangulate our findings about viewers' reactions, we used the Valence Aware Dictionary and Sentiment Reasoner to conduct sentiment analysis on comments from each type of content creator. 
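The convolutional models compared in the breastfeeding-condition study above (doi 10.2196/54798) are standard architectures; a minimal transfer-learning setup for a 7-class problem could look like the sketch below. The class count comes from the abstract, but the pretrained weights choice, frozen backbone, and hyperparameters are assumptions.

```python
import torch.nn as nn
from torchvision.models import resnet50, ResNet50_Weights

NUM_CLASSES = 7  # healthy, abscess, mastitis, nipple blebs, dermatosis, engorgement, nipple damage

# Start from ImageNet weights and replace the classification head
model = resnet50(weights=ResNet50_Weights.DEFAULT)
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)

# Optionally freeze the backbone and train only the new head (a common, assumed choice)
for name, param in model.named_parameters():
    param.requires_grad = name.startswith("fc")

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"trainable parameters: {trainable}")
```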
We grouped the comments into positive and negative categories and generated topics for these groups using Biterm. We then manually grouped resulting topics into broader themes for the purpose of analysis. Results: We organized the resulting topics into 28 themes across all sources. Examples of medical source transcript themes were Explanations in layman's terms and Biological explanations. Examples of news source transcript themes were Negative experiences and handling the long haul. The 2 long hauler transcript themes were Taking treatments into own hands and Changes to daily life. News sources received a greater share of negative comments. A few themes of these negative comments included Misinformation and disinformation and Issues with the health care system. Similarly, negative long hauler comments were organized into several themes, including Disillusionment with the health care system and Requiring more visibility. In contrast, positive medical source comments captured themes such as Appreciation of helpful content and Exchange of helpful information. In addition to this theme, one positive theme found in long hauler comments was Community building. Conclusions: The results of this study could help public health agencies, policy makers, organizations, and health researchers understand symptomatology and experiences related to PCC. They could also help these agencies develop their communication strategy concerning PCC. ", doi="10.2196/54501", url="https://ai.jmir.org/2024/1/e54501", url="http://www.ncbi.nlm.nih.gov/pubmed/38875666" } @Article{info:doi/10.2196/58342, author="Noda, Masao and Yoshimura, Hidekane and Okubo, Takuya and Koshu, Ryota and Uchiyama, Yuki and Nomura, Akihiro and Ito, Makoto and Takumi, Yutaka", title="Feasibility of Multimodal Artificial Intelligence Using GPT-4 Vision for the Classification of Middle Ear Disease: Qualitative Study and Validation", journal="JMIR AI", year="2024", month="May", day="31", volume="3", pages="e58342", keywords="artificial intelligence", keywords="deep learning", keywords="machine learning", keywords="generative AI", keywords="generative", keywords="tympanic membrane", keywords="middle ear disease", keywords="GPT4-Vision", keywords="otolaryngology", keywords="ears", keywords="ear", keywords="tympanic", keywords="vision", keywords="GPT", keywords="GPT4V", keywords="otoscopic", keywords="image", keywords="images", keywords="imaging", keywords="diagnosis", keywords="diagnoses", keywords="diagnostic", keywords="diagnostics", keywords="otitis", keywords="mobile phone", abstract="Background: The integration of artificial intelligence (AI), particularly deep learning models, has transformed the landscape of medical technology, especially in the field of diagnosis using imaging and physiological data. In otolaryngology, AI has shown promise in image classification for middle ear diseases. However, existing models often lack patient-specific data and clinical context, limiting their universal applicability. The emergence of GPT-4 Vision (GPT-4V) has enabled a multimodal diagnostic approach, integrating language processing with image analysis. Objective: In this study, we investigated the effectiveness of GPT-4V in diagnosing middle ear diseases by integrating patient-specific data with otoscopic images of the tympanic membrane. Methods: The design of this study was divided into two phases: (1) establishing a model with appropriate prompts and (2) validating the ability of the optimal prompt model to classify images. 
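The sentiment step of the YouTube study above (doi 10.2196/54501) relies on the Valence Aware Dictionary and Sentiment Reasoner (VADER); a minimal sketch of scoring comments and splitting them into positive and negative groups follows. The example comments and the compound-score threshold are assumptions, and neutral comments are lumped with negative ones here for simplicity.

```python
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

comments = [
    "Thank you, this video finally explained my symptoms.",
    "The health care system keeps dismissing long haulers.",
]

analyzer = SentimentIntensityAnalyzer()
positive, negative = [], []
for text in comments:
    compound = analyzer.polarity_scores(text)["compound"]  # ranges from -1 to 1
    # 0.05 is a commonly used cutoff; anything below it is treated as negative in this sketch
    (positive if compound >= 0.05 else negative).append((compound, text))

print("positive:", positive)
print("negative:", negative)
```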
In total, 305 otoscopic images of 4 middle ear diseases (acute otitis media, middle ear cholesteatoma, chronic otitis media, and otitis media with effusion) were obtained from patients who visited Shinshu University or Jichi Medical University between April 2010 and December 2023. The optimized GPT-4V settings were established using prompts and patients' data, and the model created with the optimal prompt was used to verify the diagnostic accuracy of GPT-4V on 190 images. To compare the diagnostic accuracy of GPT-4V with that of physicians, 30 clinicians completed a web-based questionnaire consisting of 190 images. Results: The multimodal AI approach achieved an accuracy of 82.1\%, which is superior to that of certified pediatricians at 70.6\%, but trailing behind that of otolaryngologists at more than 95\%. The model's disease-specific accuracy rates were 89.2\% for acute otitis media, 76.5\% for chronic otitis media, 79.3\% for middle ear cholesteatoma, and 85.7\% for otitis media with effusion, which highlights the need for disease-specific optimization. Comparisons with physicians revealed promising results, suggesting the potential of GPT-4V to augment clinical decision-making. Conclusions: Despite its advantages, challenges such as data privacy and ethical considerations must be addressed. Overall, this study underscores the potential of multimodal AI for enhancing diagnostic accuracy and improving patient care in otolaryngology. Further research is warranted to optimize and validate this approach in diverse clinical settings. ", doi="10.2196/58342", url="https://ai.jmir.org/2024/1/e58342", url="http://www.ncbi.nlm.nih.gov/pubmed/38875669" } @Article{info:doi/10.2196/51756, author="Siegel, Nicole Leeann and Wiseman, P. Kara and Budenz, Alex and Prutzman, Yvonne", title="Identifying Patterns of Smoking Cessation App Feature Use That Predict Successful Quitting: Secondary Analysis of Experimental Data Leveraging Machine Learning", journal="JMIR AI", year="2024", month="May", day="22", volume="3", pages="e51756", keywords="smartphone apps", keywords="machine learning", keywords="artificial intelligence", keywords="smoking cessation", keywords="mHealth", keywords="mobile health", keywords="app", keywords="apps", keywords="applications", keywords="application feature", keywords="features", keywords="smoking", keywords="smoke", keywords="smoker", keywords="smokers", keywords="cessation", keywords="quit", keywords="quitting", keywords="algorithm", keywords="algorithms", keywords="mobile phone", abstract="Background: Leveraging free smartphone apps can help expand the availability and use of evidence-based smoking cessation interventions. However, there is a need for additional research investigating how the use of different features within such apps impacts their effectiveness. Objective: We used observational data collected from an experiment of a publicly available smoking cessation app to develop supervised machine learning (SML) algorithms intended to distinguish the app features that promote successful smoking cessation. We then assessed the extent to which patterns of app feature use accounted for variance in cessation that could not be explained by other known predictors of cessation (eg, tobacco use behaviors). Methods: Data came from an experiment (ClinicalTrials.gov NCT04623736) testing the impacts of incentivizing ecological momentary assessments within the National Cancer Institute's quitSTART app. 
Participants' (N=133) app activity, including every action they took within the app and its corresponding time stamp, was recorded. Demographic and baseline tobacco use characteristics were measured at the start of the experiment, and short-term smoking cessation (7-day point prevalence abstinence) was measured at 4 weeks after baseline. Logistic regression SML modeling was used to estimate participants' probability of cessation from 28 variables reflecting participants' use of different app features, assigned experimental conditions, and phone type (iPhone [Apple Inc] or Android [Google]). The SML model was first fit in a training set (n=100) and then its accuracy was assessed in a held-aside test set (n=33). Within the test set, a likelihood ratio test (n=30) assessed whether adding individuals' SML-predicted probabilities of cessation to a logistic regression model that included demographic and tobacco use (eg, polyuse) variables explained additional variance in 4-week cessation. Results: The SML model's sensitivity (0.67) and specificity (0.67) in the held-aside test set indicated that individuals' patterns of using different app features predicted cessation with reasonable accuracy. The likelihood ratio test showed that the logistic regression, which included the SML model--predicted probabilities, was statistically equivalent to the model that only included the demographic and tobacco use variables (P=.16). Conclusions: Harnessing user data through SML could help determine the features of smoking cessation apps that are most useful. This methodological approach could be applied in future research focusing on smoking cessation app features to inform the development and improvement of smoking cessation apps. Trial Registration: ClinicalTrials.gov NCT04623736; https://clinicaltrials.gov/study/NCT04623736 ", doi="10.2196/51756", url="https://ai.jmir.org/2024/1/e51756", url="http://www.ncbi.nlm.nih.gov/pubmed/38875564" } @Article{info:doi/10.2196/48067, author="Kamruzzaman, Methun and Heavey, Jack and Song, Alexander and Bielskas, Matthew and Bhattacharya, Parantapa and Madden, Gregory and Klein, Eili and Deng, Xinwei and Vullikanti, Anil", title="Improving Risk Prediction of Methicillin-Resistant Staphylococcus aureus Using Machine Learning Methods With Network Features: Retrospective Development Study", journal="JMIR AI", year="2024", month="May", day="16", volume="3", pages="e48067", keywords="methicillin-resistant Staphylococcus aureus", keywords="network", keywords="machine learning", keywords="penalized logistic regression", keywords="ensemble learning", keywords="gradient-boosted classifier", keywords="random forest classifier", keywords="extreme boosted gradient boosted classifier", keywords="Shapley Additive Explanations", keywords="SHAP", keywords="health care--associated infection", keywords="HAI", abstract="Background: Health care--associated infections due to multidrug-resistant organisms (MDROs), such as methicillin-resistant Staphylococcus aureus (MRSA) and Clostridioides difficile (CDI), place a significant burden on our health care infrastructure. Objective: Screening for MDROs is an important mechanism for preventing spread but is resource intensive. The objective of this study was to develop automated tools that can predict colonization or infection risk using electronic health record (EHR) data, provide useful information to aid infection control, and guide empiric antibiotic coverage. 
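A generic sketch in the spirit of the quitSTART analysis above (doi 10.2196/51756): a logistic regression fit on a training split and evaluated on a held-aside test set with sensitivity and specificity. The synthetic features, sample sizes, and hyperparameters are assumptions, not the study's pipeline.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

rng = np.random.default_rng(0)
X = rng.normal(size=(133, 28))                        # 28 app-usage features per participant
y = (X[:, 0] + rng.normal(size=133) > 0).astype(int)  # synthetic 7-day abstinence outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=33, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
print(f"sensitivity={tp / (tp + fn):.2f}, specificity={tn / (tn + fp):.2f}")
```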
Methods: We retrospectively developed a machine learning model to detect MRSA colonization and infection in undifferentiated patients at the time of sample collection from hospitalized patients at the University of Virginia Hospital. We used clinical and nonclinical features derived from on-admission and throughout-stay information from the patient's EHR data to build the model. In addition, we used a class of features derived from contact networks in EHR data; these network features can capture patients' contacts with providers and other patients, improving model interpretability and accuracy for predicting the outcome of surveillance tests for MRSA. Finally, we explored heterogeneous models for different patient subpopulations, for example, those admitted to an intensive care unit or emergency department or those with specific testing histories, which perform better. Results: We found that the penalized logistic regression performs better than other methods, and this model's performance measured in terms of its receiver operating characteristics-area under the curve score improves by nearly 11\% when we use polynomial (second-degree) transformation of the features. Some significant features in predicting MDRO risk include antibiotic use, surgery, use of devices, dialysis, patient's comorbidity conditions, and network features. Among these, network features add the most value and improve the model's performance by at least 15\%. The penalized logistic regression model with the same transformation of features also performs better than other models for specific patient subpopulations. Conclusions: Our study shows that MRSA risk prediction can be conducted quite effectively by machine learning methods using clinical and nonclinical features derived from EHR data. Network features are the most predictive and provide significant improvement over prior methods. Furthermore, heterogeneous prediction models for different patient subpopulations enhance the model's performance. ", doi="10.2196/48067", url="https://ai.jmir.org/2024/1/e48067", url="http://www.ncbi.nlm.nih.gov/pubmed/38875598" } @Article{info:doi/10.2196/46875, author="Hammoud, Mohammad and Douglas, Shahd and Darmach, Mohamad and Alawneh, Sara and Sanyal, Swapnendu and Kanbour, Youssef", title="Evaluating the Diagnostic Performance of Symptom Checkers: Clinical Vignette Study", journal="JMIR AI", year="2024", month="Apr", day="29", volume="3", pages="e46875", keywords="digital health", keywords="symptom checker", keywords="artificial intelligence", keywords="AI", keywords="patient-centered care", keywords="eHealth apps", keywords="eHealth", abstract="Background: Medical self-diagnostic tools (or symptom checkers) are becoming an integral part of digital health and our daily lives, whereby patients are increasingly using them to identify the underlying causes of their symptoms. As such, it is essential to rigorously investigate and comprehensively report the diagnostic performance of symptom checkers using standard clinical and scientific approaches. Objective: This study aims to evaluate and report the accuracies of a few known and new symptom checkers using a standard and transparent methodology, which allows the scientific community to cross-validate and reproduce the reported results, a step much needed in health informatics. Methods: We propose a 4-stage experimentation methodology that capitalizes on the standard clinical vignette approach to evaluate 6 symptom checkers. 
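For the MRSA study above (doi 10.2196/48067), the reported gain from a second-degree polynomial transformation ahead of a penalized logistic regression can be sketched generically as below; the synthetic data, L1 penalty, and preprocessing choices are assumptions rather than the authors' configuration.

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(1)
X = rng.normal(size=(2000, 10))                                   # clinical + contact-network features
y = (X[:, 0] * X[:, 1] + rng.normal(size=2000) > 0).astype(int)   # outcome driven by an interaction

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=1)
model = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),  # adds squares and pairwise interactions
    StandardScaler(),
    LogisticRegression(penalty="l1", solver="liblinear", C=0.5),
)
model.fit(X_tr, y_tr)
print("ROC AUC:", round(roc_auc_score(y_te, model.predict_proba(X_te)[:, 1]), 3))
```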
To this end, we developed and peer-reviewed 400 vignettes, each approved by at least 5 out of 7 independent and experienced primary care physicians. To establish a frame of reference and interpret the results of symptom checkers accordingly, we further compared the best-performing symptom checker against 3 primary care physicians with an average experience of 16.6 (SD 9.42) years. To measure accuracy, we used 7 standard metrics, including M1 as a measure of a symptom checker's or a physician's ability to return a vignette's main diagnosis at the top of their differential list, F1-score as a trade-off measure between recall and precision, and Normalized Discounted Cumulative Gain (NDCG) as a measure of a differential list's ranking quality, among others. Results: The diagnostic accuracies of the 6 tested symptom checkers vary significantly. For instance, the differences in the M1, F1-score, and NDCG results between the best-performing and worst-performing symptom checkers or ranges were 65.3\%, 39.2\%, and 74.2\%, respectively. The same was observed among the participating human physicians, whereby the M1, F1-score, and NDCG ranges were 22.8\%, 15.3\%, and 21.3\%, respectively. When compared against each other, physicians outperformed the best-performing symptom checker by an average of 1.2\% using F1-score, whereas the best-performing symptom checker outperformed physicians by averages of 10.2\% and 25.1\% using M1 and NDCG, respectively. Conclusions: The performance variation between symptom checkers is substantial, suggesting that symptom checkers cannot be treated as a single entity. On a different note, the best-performing symptom checker was an artificial intelligence (AI)--based one, shedding light on the promise of AI in improving the diagnostic capabilities of symptom checkers, especially as AI keeps advancing exponentially. ", doi="10.2196/46875", url="https://ai.jmir.org/2024/1/e46875", url="http://www.ncbi.nlm.nih.gov/pubmed/38875676" } @Article{info:doi/10.2196/47194, author="Yan, Runze and Liu, Xinwen and Dutcher, M. Janine and Tumminia, J. Michael and Villalba, Daniella and Cohen, Sheldon and Creswell, D. John and Creswell, Kasey and Mankoff, Jennifer and Dey, K. Anind and Doryab, Afsaneh", title="Identifying Links Between Productivity and Biobehavioral Rhythms Modeled From Multimodal Sensor Streams: Exploratory Quantitative Study", journal="JMIR AI", year="2024", month="Apr", day="18", volume="3", pages="e47194", keywords="biobehavioral rhythms", keywords="productivity", keywords="computational modeling", keywords="mobile sensing", keywords="mobile phone", abstract="Background: Biobehavioral rhythms are biological, behavioral, and psychosocial processes with repeating cycles. Abnormal rhythms have been linked to various health issues, such as sleep disorders, obesity, and depression. Objective: This study aims to identify links between productivity and biobehavioral rhythms modeled from passively collected mobile data streams. Methods: In this study, we used a multimodal mobile sensing data set consisting of data collected from smartphones and Fitbits worn by 188 college students over a continuous period of 16 weeks. The participants reported their self-evaluated daily productivity score (ranging from 0 to 4) during weeks 1, 6, and 15. To analyze the data, we modeled cyclic human behavior patterns based on multimodal mobile sensing data gathered during weeks 1, 6, 15, and the adjacent weeks. Our methodology resulted in the creation of a rhythm model for each sensor feature. 
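The NDCG metric used in the symptom-checker evaluation above (doi 10.2196/46875) rewards placing the correct diagnosis near the top of the differential list; a minimal implementation over a binary relevance vector is sketched below. The example differential list is hypothetical.

```python
import math

def dcg(relevances):
    """Discounted cumulative gain: relevance discounted by log2 of the rank."""
    return sum(rel / math.log2(rank + 1) for rank, rel in enumerate(relevances, start=1))

def ndcg(relevances):
    ideal = sorted(relevances, reverse=True)
    return dcg(relevances) / dcg(ideal) if any(relevances) else 0.0

# Hypothetical differential list: the vignette's main diagnosis appears at rank 3
relevance_by_rank = [0, 0, 1, 0, 0]
print(round(ndcg(relevance_by_rank), 3))  # 0.5, since 1/log2(4) = 0.5
```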
Additionally, we developed a correlation-based approach to identify connections between rhythm stability and high or low productivity levels. Results: Differences exist in the biobehavioral rhythms of high- and low-productivity students, with those demonstrating greater rhythm stability also exhibiting higher productivity levels. Notably, a negative correlation (C=--0.16) was observed between productivity and the SE of the phase for the 24-hour period during week 1, with a higher SE indicative of lower rhythm stability. Conclusions: Modeling biobehavioral rhythms has the potential to quantify and forecast productivity. The findings have implications for building novel cyber-human systems that align with human beings' biobehavioral rhythms to improve health, well-being, and work performance. ", doi="10.2196/47194", url="https://ai.jmir.org/2024/1/e47194" } @Article{info:doi/10.2196/40781, author="Waheed, Atif Muhammad and Liu, Lu", title="Perceptions of Family Physicians About Applying AI in Primary Health Care: Case Study From a Premier Health Care Organization", journal="JMIR AI", year="2024", month="Apr", day="17", volume="3", pages="e40781", keywords="AI", keywords="artificial intelligence", keywords="perception", keywords="attitude", keywords="opinion", keywords="surveys and questionnaires", keywords="family physician", keywords="primary care", keywords="health care service provider", keywords="health care professional", keywords="ethical", keywords="AI decision-making", keywords="AI challenges", abstract="Background: The COVID-19 pandemic has led to the rapid proliferation of artificial intelligence (AI), which was not previously anticipated; this is an unforeseen development. The use of AI in health care settings is increasing, as it proves to be a promising tool for transforming health care systems, improving operational and business processes, and efficiently simplifying health care tasks for family physicians and health care administrators. Therefore, it is necessary to assess the perspective of family physicians on AI and its impact on their job roles. Objective: This study aims to determine the impact of AI on the management and practices of Qatar's Primary Health Care Corporation (PHCC) in improving health care tasks and service delivery. Furthermore, it seeks to evaluate the impact of AI on family physicians' job roles, including associated risks and ethical ramifications from their perspective. Methods: We conducted a cross-sectional survey and sent a web-based questionnaire survey link to 724 practicing family physicians at the PHCC. In total, we received 102 eligible responses. Results: Of the 102 respondents, 72 (70.6\%) were men and 94 (92.2\%) were aged between 35 and 54 years. In addition, 58 (56.9\%) of the 102 respondents were consultants. The overall awareness of AI was 80 (78.4\%) out of 102, with no difference between gender (P=.06) and age groups (P=.12). AI is perceived to play a positive role in improving health care practices at PHCC (P<.001), managing health care tasks (P<.001), and positively impacting health care service delivery (P<.001). Family physicians also perceived that their clinical, administrative, and opportunistic health care management roles were positively influenced by AI (P<.001). 
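Rhythm models like those in the productivity study above (doi 10.2196/47194) are often summarized by the amplitude and phase of a fitted 24-hour cycle; a basic cosinor fit by least squares is sketched below. The synthetic step-count signal and the single 24-hour harmonic are assumptions made for illustration.

```python
import numpy as np

# Synthetic hourly step counts over one week with a 24-hour rhythm plus noise
rng = np.random.default_rng(2)
t = np.arange(0, 7 * 24, 1.0)                      # time in hours
signal = 300 + 200 * np.cos(2 * np.pi * (t - 14) / 24) + rng.normal(0, 50, t.size)

# Cosinor model: y = M + b1*cos(wt) + b2*sin(wt), which is linear in (b1, b2)
w = 2 * np.pi / 24
X = np.column_stack([np.ones_like(t), np.cos(w * t), np.sin(w * t)])
mesor, b1, b2 = np.linalg.lstsq(X, signal, rcond=None)[0]

amplitude = np.hypot(b1, b2)
acrophase_hours = (np.arctan2(b2, b1) / w) % 24    # hour of the daily peak
print(f"mesor={mesor:.0f}, amplitude={amplitude:.0f}, peak at ~{acrophase_hours:.1f} h")
```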
Furthermore, perceptions of family physicians indicate that AI improves operational and human resource management (P<.001), does not undermine patient-physician relationships (P<.001), and is not considered superior to human physicians in the clinical judgment process (P<.001). However, its inclusion is believed to decrease patient satisfaction (P<.001). AI decision-making and accountability were recognized as ethical risks, along with data protection and confidentiality. The optimism regarding using AI for future medical decisions was low among family physicians. Conclusions: This study indicated a positive perception among family physicians regarding AI integration into primary care settings. AI demonstrates significant potential for enhancing health care task management and overall service delivery at the PHCC. It augments family physicians' roles without replacing them and proves beneficial for operational efficiency, human resource management, and public health during pandemics. While the implementation of AI is anticipated to bring benefits, the careful consideration of ethical, privacy, confidentiality, and patient-centric concerns is essential. These insights provide valuable guidance for the strategic integration of AI into health care systems, with a focus on maintaining high-quality patient care and addressing the multifaceted challenges that arise during this transformative process. ", doi="10.2196/40781", url="https://ai.jmir.org/2024/1/e40781", url="http://www.ncbi.nlm.nih.gov/pubmed/38875531" } @Article{info:doi/10.2196/52054, author="Wiepert, Daniela and Malin, A. Bradley and Duffy, R. Joseph and Utianski, L. Rene and Stricker, L. John and Jones, T. David and Botha, Hugo", title="Reidentification of Participants in Shared Clinical Data Sets: Experimental Study", journal="JMIR AI", year="2024", month="Mar", day="15", volume="3", pages="e52054", keywords="reidentification", keywords="privacy", keywords="adversarial attack", keywords="health care", keywords="speech disorders", keywords="voiceprint", abstract="Background: Large curated data sets are required to leverage speech-based tools in health care. These are costly to produce, resulting in increased interest in data sharing. As speech can potentially identify speakers (ie, voiceprints), sharing recordings raises privacy concerns. This is especially relevant when working with patient data protected under the Health Insurance Portability and Accountability Act. Objective: We aimed to determine the reidentification risk for speech recordings, without reference to demographics or metadata, in clinical data sets considering both the size of the search space (ie, the number of comparisons that must be considered when reidentifying) and the nature of the speech recording (ie, the type of speech task). Methods: Using a state-of-the-art speaker identification model, we modeled an adversarial attack scenario in which an adversary uses a large data set of identified speech (hereafter, the known set) to reidentify as many unknown speakers in a shared data set (hereafter, the unknown set) as possible. We first considered the effect of search space size by attempting reidentification with various sizes of known and unknown sets using VoxCeleb, a data set with recordings of natural, connected speech from >7000 healthy speakers. We then repeated these tests with different types of recordings in each set to examine whether the nature of a speech recording influences reidentification risk. 
For these tests, we used our clinical data set composed of recordings of elicited speech tasks from 941 speakers. Results: We found that the risk was inversely related to the number of comparisons an adversary must consider (ie, the search space), with a positive linear correlation between the number of false acceptances (FAs) and the number of comparisons (r=0.69; P<.001). The true acceptances (TAs) stayed relatively stable, and the ratio between FAs and TAs rose from 0.02 at 1 {\texttimes} 10\textsuperscript{5} comparisons to 1.41 at 6 {\texttimes} 10\textsuperscript{6} comparisons, with a near 1:1 ratio at the midpoint of 3 {\texttimes} 10\textsuperscript{6} comparisons. In effect, risk was high for a small search space but dropped as the search space grew. We also found that the nature of a speech recording influenced reidentification risk, with nonconnected speech (eg, vowel prolongation: FA/TA=98.5; alternating motion rate: FA/TA=8) being harder to identify than connected speech (eg, sentence repetition: FA/TA=0.54) in cross-task conditions. The inverse was mostly true in within-task conditions, with the FA/TA ratio for vowel prolongation and alternating motion rate dropping to 0.39 and 1.17, respectively. Conclusions: Our findings suggest that speaker identification models can be used to reidentify participants in specific circumstances, but in practice, the reidentification risk appears small. The variation in risk due to search space size and type of speech task provides actionable recommendations to further increase participant privacy and considerations for policy regarding public release of speech recordings. ", doi="10.2196/52054", url="https://ai.jmir.org/2024/1/e52054", url="http://www.ncbi.nlm.nih.gov/pubmed/38875581" } @Article{info:doi/10.2196/53656, author="Sebo, Paul", title="What Is the Performance of ChatGPT in Determining the Gender of Individuals Based on Their First and Last Names?", journal="JMIR AI", year="2024", month="Mar", day="13", volume="3", pages="e53656", keywords="accuracy", keywords="artificial intelligence", keywords="AI", keywords="ChatGPT", keywords="gender", keywords="gender detection tool", keywords="misclassification", keywords="name", keywords="performance", keywords="gender detection", keywords="gender detection tools", keywords="inequalities", keywords="language model", keywords="NamSor", keywords="Gender API", keywords="Switzerland", keywords="physicians", keywords="gender bias", keywords="disparities", keywords="gender disparities", keywords="gender gap", doi="10.2196/53656", url="https://ai.jmir.org/2024/1/e53656" } @Article{info:doi/10.2196/52211, author="Ewals, S. Lotte J. and Heesterbeek, J. Lynn J. and Yu, Bin and van der Wulp, Kasper and Mavroeidis, Dimitrios and Funk, Mathias and Snijders, P. Chris C. and Jacobs, Igor and Nederend, Joost and Pluyter, R. 
Jon and ", title="The Impact of Expectation Management and Model Transparency on Radiologists' Trust and Utilization of AI Recommendations for Lung Nodule Assessment on Computed Tomography: Simulated Use Study", journal="JMIR AI", year="2024", month="Mar", day="13", volume="3", pages="e52211", keywords="application", keywords="artificial intelligence", keywords="AI", keywords="computer-aided detection or diagnosis", keywords="CAD", keywords="design", keywords="human centered", keywords="human computer interaction", keywords="HCI", keywords="interaction", keywords="mental model", keywords="radiologists", keywords="trust", abstract="Background: Many promising artificial intelligence (AI) and computer-aided detection and diagnosis systems have been developed, but few have been successfully integrated into clinical practice. This is partially owing to a lack of user-centered design of AI-based computer-aided detection or diagnosis (AI-CAD) systems. Objective: We aimed to assess the impact of different onboarding tutorials and levels of AI model explainability on radiologists' trust in AI and the use of AI recommendations in lung nodule assessment on computed tomography (CT) scans. Methods: In total, 20 radiologists from 7 Dutch medical centers performed lung nodule assessment on CT scans under different conditions in a simulated use study as part of a 2{\texttimes}2 repeated-measures quasi-experimental design. Two types of AI onboarding tutorials (reflective vs informative) and 2 levels of AI output (black box vs explainable) were designed. The radiologists first received an onboarding tutorial that was either informative or reflective. Subsequently, each radiologist assessed 7 CT scans, first without AI recommendations. AI recommendations were shown to the radiologist, and they could adjust their initial assessment. Half of the participants received the recommendations via black box AI output and half received explainable AI output. Mental model and psychological trust were measured before onboarding, after onboarding, and after assessing the 7 CT scans. We recorded whether radiologists changed their assessment on found nodules, malignancy prediction, and follow-up advice for each CT assessment. In addition, we analyzed whether radiologists' trust in their assessments had changed based on the AI recommendations. Results: Both variations of onboarding tutorials resulted in a significantly improved mental model of the AI-CAD system (informative P=.01 and reflective P=.01). After using AI-CAD, psychological trust significantly decreased for the group with explainable AI output (P=.02). On the basis of the AI recommendations, radiologists changed the number of reported nodules in 27 of 140 assessments, malignancy prediction in 32 of 140 assessments, and follow-up advice in 12 of 140 assessments. The changes were mostly an increased number of reported nodules, a higher estimated probability of malignancy, and earlier follow-up. The radiologists' confidence in their found nodules changed in 82 of 140 assessments, in their estimated probability of malignancy in 50 of 140 assessments, and in their follow-up advice in 28 of 140 assessments. These changes were predominantly increases in confidence. The number of changed assessments and radiologists' confidence did not significantly differ between the groups that received different onboarding tutorials and AI outputs. 
Conclusions: Onboarding tutorials help radiologists gain a better understanding of AI-CAD and facilitate the formation of a correct mental model. If AI explanations do not consistently substantiate the probability of malignancy across patient cases, radiologists' trust in the AI-CAD system can be impaired. Radiologists' confidence in their assessments was improved by using the AI recommendations. ", doi="10.2196/52211", url="https://ai.jmir.org/2024/1/e52211", url="http://www.ncbi.nlm.nih.gov/pubmed/38875574" } @Article{info:doi/10.2196/48295, author="Young, A. Joshua and Chang, Chin-Wen and Scales, W. Charles and Menon, V. Saurabh and Holy, E. Chantal and Blackie, Adrienne Caroline", title="Machine Learning Methods Using Artificial Intelligence Deployed on Electronic Health Record Data for Identification and Referral of At-Risk Patients From Primary Care Physicians to Eye Care Specialists: Retrospective, Case-Controlled Study", journal="JMIR AI", year="2024", month="Mar", day="12", volume="3", pages="e48295", keywords="decision support for health professionals", keywords="tools, programs and algorithms", keywords="electronic health record", keywords="primary care", keywords="artificial intelligence", keywords="AI", keywords="prediction accuracy", keywords="triaging", keywords="AI model", keywords="eye care", keywords="ophthalmic", abstract="Background: Identification and referral of at-risk patients from primary care practitioners (PCPs) to eye care professionals remain a challenge. Approximately 1.9 million Americans suffer from vision loss as a result of undiagnosed or untreated ophthalmic conditions. In ophthalmology, artificial intelligence (AI) is used to predict glaucoma progression, recognize diabetic retinopathy (DR), and classify ocular tumors; however, AI has not yet been used to triage primary care patients for ophthalmology referral. Objective: This study aimed to build and compare machine learning (ML) methods, applicable to electronic health records (EHRs) of PCPs, capable of triaging patients for referral to eye care specialists. Methods: Accessing the Optum deidentified EHR data set, 743,039 patients with 5 leading vision conditions (age-related macular degeneration [AMD], visually significant cataract, DR, glaucoma, or ocular surface disease [OSD]) were exact-matched on age and gender to 743,039 controls without eye conditions. Between 142 and 182 non-ophthalmic parameters per patient were input into 5 ML methods: generalized linear model, L1-regularized logistic regression, random forest, Extreme Gradient Boosting (XGBoost), and J48 decision tree. Model performance was compared for each pathology to select the most predictive algorithm. The area under the curve (AUC) was assessed for all algorithms for each outcome. Results: XGBoost demonstrated the best performance, showing, respectively, a prediction accuracy and an AUC of 78.6\% (95\% CI 78.3\%-78.9\%) and 0.878 for visually significant cataract, 77.4\% (95\% CI 76.7\%-78.1\%) and 0.858 for exudative AMD, 79.2\% (95\% CI 78.8\%-79.6\%) and 0.879 for nonexudative AMD, 72.2\% (95\% CI 69.9\%-74.5\%) and 0.803 for OSD requiring medication, 70.8\% (95\% CI 70.5\%-71.1\%) and 0.785 for glaucoma, 85.0\% (95\% CI 84.2\%-85.8\%) and 0.924 for type 1 nonproliferative diabetic retinopathy (NPDR), 82.2\% (95\% CI 80.4\%-84.0\%) and 0.911 for type 1 proliferative diabetic retinopathy (PDR), 81.3\% (95\% CI 81.0\%-81.6\%) and 0.891 for type 2 NPDR, and 82.1\% (95\% CI 81.3\%-82.9\%) and 0.900 for type 2 PDR. 
Conclusions: The 5 ML methods deployed were able to successfully identify patients with elevated odds ratios (ORs), thus capable of patient triage, for ocular pathology ranging from 2.4 (95\% CI 2.4-2.5) for glaucoma to 5.7 (95\% CI 5.0-6.4) for type 1 NPDR, with an average OR of 3.9. The application of these models could enable PCPs to better identify and triage patients at risk for treatable ophthalmic pathology. Early identification of patients with unrecognized sight-threatening conditions may lead to earlier treatment and a reduced economic burden. More importantly, such triage may improve patients' lives. ", doi="10.2196/48295", url="https://ai.jmir.org/2024/1/e48295", url="http://www.ncbi.nlm.nih.gov/pubmed/38875582" } @Article{info:doi/10.2196/50525, author="Goh, WB Wilson and Chia, YA Kendrick and Cheung, FK Max and Kee, M. Kalya and Lwin, O. May and Schulz, J. Peter and Chen, Minhu and Wu, Kaichun and Ng, SM Simon and Lui, Rashid and Ang, Leong Tiing and Yeoh, Guan Khay and Chiu, Han-mo and Wu, Deng-chyang and Sung, JY Joseph", title="Risk Perception, Acceptance, and Trust of Using AI in Gastroenterology Practice in the Asia-Pacific Region: Web-Based Survey Study", journal="JMIR AI", year="2024", month="Mar", day="7", volume="3", pages="e50525", keywords="artificial intelligence", keywords="delivery of health care", keywords="gastroenterology", keywords="acceptance", keywords="trust", keywords="adoption", keywords="survey", keywords="surveys", keywords="questionnaire", keywords="questionnaires", keywords="detect", keywords="detection", keywords="colonoscopy", keywords="gastroenterologist", keywords="gastroenterologists", keywords="internal medicine", keywords="polyp", keywords="polyps", keywords="surgeon", keywords="surgeons", keywords="surgery", keywords="surgical", keywords="colorectal", abstract="Background: The use of artificial intelligence (AI) can revolutionize health care, but this raises risk concerns. It is therefore crucial to understand how clinicians trust and accept AI technology. Gastroenterology, by its nature of being an image-based and intervention-heavy specialty, is an area where AI-assisted diagnosis and management can be applied extensively. Objective: This study aimed to study how gastroenterologists or gastrointestinal surgeons accept and trust the use of AI in computer-aided detection (CADe), computer-aided characterization (CADx), and computer-aided intervention (CADi) of colorectal polyps in colonoscopy. Methods: We conducted a web-based questionnaire from November 2022 to January 2023, involving 5 countries or areas in the Asia-Pacific region. The questionnaire included variables such as background and demography of users; intention to use AI, perceived risk; acceptance; and trust in AI-assisted detection, characterization, and intervention. We presented participants with 3 AI scenarios related to colonoscopy and the management of colorectal polyps. These scenarios reflect existing AI applications in colonoscopy, namely the detection of polyps (CADe), characterization of polyps (CADx), and AI-assisted polypectomy (CADi). Results: In total, 165 gastroenterologists and gastrointestinal surgeons responded to a web-based survey using the structured questionnaire designed by experts in medical communications. Participants had a mean age of 44 (SD 9.65) years, were mostly male (n=116, 70.3\%), and mostly worked in publicly funded hospitals (n=110, 66.67\%). 
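The best-performing model in the referral-triage study above (doi 10.2196/48295) was XGBoost; a generic binary-triage sketch with an AUC readout is shown below. The synthetic features stand in for the 142-182 non-ophthalmic EHR parameters mentioned in the abstract, and all hyperparameters are assumptions.

```python
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(3)
X = rng.normal(size=(5000, 150))                                     # stand-in EHR-derived features
y = (X[:, :3].sum(axis=1) + rng.normal(size=5000) > 0).astype(int)   # stand-in referral label

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=3)
model = XGBClassifier(n_estimators=300, max_depth=4, learning_rate=0.1, eval_metric="logloss")
model.fit(X_tr, y_tr)

auc = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
print(f"held-out AUC: {auc:.3f}")
```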
Participants reported relatively high exposure to AI, with 111 (67.27\%) reporting having used AI for clinical diagnosis or treatment of digestive diseases. Gastroenterologists are highly interested in using AI for diagnosis but show varying levels of reservation regarding risk perception and acceptance of AI. Most participants (n=112, 72.72\%) also expressed interest in using AI in their future practice. CADe was accepted by 83.03\% (n=137) of respondents, CADx was accepted by 78.79\% (n=130), and CADi was accepted by 72.12\% (n=119). CADe and CADx were trusted by 85.45\% (n=141) of respondents and CADi was trusted by 72.12\% (n=119). There were no application-specific differences in risk perceptions, but more experienced clinicians gave lower risk ratings. Conclusions: Gastroenterologists reported overall high acceptance and trust levels of using AI-assisted colonoscopy in the management of colorectal polyps. However, this level of trust depends on the application scenario. Moreover, the relationship among risk perception, acceptance, and trust in using AI in gastroenterology practice is not straightforward. ", doi="10.2196/50525", url="https://ai.jmir.org/2024/1/e50525", url="http://www.ncbi.nlm.nih.gov/pubmed/38875591" } @Article{info:doi/10.2196/47122, author="Rodriguez, V. Danissa and Chen, Ji and Viswanadham, N. Ratnalekha V. and Lawrence, Katharine and Mann, Devin", title="Leveraging Machine Learning to Develop Digital Engagement Phenotypes of Users in a Digital Diabetes Prevention Program: Evaluation Study", journal="JMIR AI", year="2024", month="Mar", day="1", volume="3", pages="e47122", keywords="machine learning", keywords="digital health", keywords="diabetes", keywords="mobile health", keywords="messaging platforms", keywords="user engagement", keywords="patient behavior", keywords="digital diabetes prevention programs", keywords="digital phenotypes", keywords="digital prescription", keywords="users", keywords="prevention", keywords="evaluation study", keywords="communication", keywords="support", keywords="engagement", keywords="phenotypes", keywords="digital health intervention", keywords="chronic disease management", abstract="Background: Digital diabetes prevention programs (dDPPs) are effective ``digital prescriptions'' but have high attrition rates and program noncompletion. To address this, we developed a personalized automatic messaging system (PAMS) that leverages SMS text messaging and data integration into clinical workflows to increase dDPP engagement via enhanced patient-provider communication. Preliminary data showed positive results. However, further investigation is needed to determine how to optimize the tailoring of support technology such as PAMS based on a user's preferences to boost their dDPP engagement. Objective: This study evaluates leveraging machine learning (ML) to develop digital engagement phenotypes of dDPP users and assess ML's accuracy in predicting engagement with dDPP activities. This research will be used in a PAMS optimization process to improve PAMS personalization by incorporating engagement prediction and digital phenotyping. This study aims (1) to prove the feasibility of using dDPP user-collected data to build an ML model that predicts engagement and contributes to identifying digital engagement phenotypes, (2) to describe methods for developing ML models with dDPP data sets and present preliminary results, and (3) to present preliminary data on user profiling based on ML model outputs. 
Methods: Using the gradient-boosted forest model, we predicted engagement in 4 dDPP individual activities (physical activity, lessons, social activity, and weigh-ins) and general activity (engagement in any activity) based on previous short- and long-term activity in the app. The area under the receiver operating characteristic curve, the area under the precision-recall curve, and the Brier score metrics determined the performance of the model. Shapley values reflected the feature importance of the models and determined what variables informed user profiling through latent profile analysis. Results: We developed 2 models using weekly and daily DPP data sets (328,821 and 704,242 records, respectively), which yielded predictive accuracies above 90\%. Although both models were highly accurate, the daily model better fitted our research plan because it predicted daily changes in individual activities, which was crucial for creating the ``digital phenotypes.'' To better understand the variables contributing to the model predictions, we calculated the Shapley values for both models to identify the features with the highest contribution to model fit; engagement with any activity in the dDPP in the last 7 days had the most predictive power. We profiled users with latent profile analysis after 2 weeks of engagement (Bayesian information criterion=--3222.46) with the dDPP and identified 6 profiles of users, including those with high engagement, minimal engagement, and attrition. Conclusions: Preliminary results demonstrate that applying ML methods with predictive power is an acceptable mechanism to tailor and optimize messaging interventions to support patient engagement and adherence to digital prescriptions. The results enable future optimization of our existing messaging platform and expansion of this methodology to other clinical domains. 
Trial Registration: ClinicalTrials.gov NCT04773834; https://www.clinicaltrials.gov/ct2/show/NCT04773834 International Registered Report Identifier (IRRID): RR2-10.2196/26750 ", doi="10.2196/47122", url="https://ai.jmir.org/2024/1/e47122", url="http://www.ncbi.nlm.nih.gov/pubmed/38875579" } @Article{info:doi/10.2196/51535, author="Racine, Nicole and Chow, Cheryl and Hamwi, Lojain and Bucsea, Oana and Cheng, Carol and Du, Hang and Fabrizi, Lorenzo and Jasim, Sara and Johannsson, Lesley and Jones, Laura and Laudiano-Dray, Pureza Maria and Meek, Judith and Mistry, Neelum and Shah, Vibhuti and Stedman, Ian and Wang, Xiaogang and Riddell, Pillai Rebecca", title="Health Care Professionals' and Parents' Perspectives on the Use of AI for Pain Monitoring in the Neonatal Intensive Care Unit: Multisite Qualitative Study", journal="JMIR AI", year="2024", month="Feb", day="9", volume="3", pages="e51535", keywords="pain monitoring", keywords="pain management", keywords="preterm infant", keywords="neonate", keywords="pain", keywords="infant", keywords="infants", keywords="neonates", keywords="newborn", keywords="newborns", keywords="neonatal", keywords="baby", keywords="babies", keywords="pediatric", keywords="pediatrics", keywords="preterm", keywords="premature", keywords="assessment", keywords="intensive care", keywords="NICU", keywords="neonatal intensive care unit", keywords="HCP", keywords="health care professional", keywords="health care professionals", keywords="experience", keywords="experiences", keywords="attitude", keywords="attitudes", keywords="opinion", keywords="perception", keywords="perceptions", keywords="perspective", keywords="perspectives", keywords="acceptance", keywords="adoption", keywords="willingness", keywords="artificial intelligence", keywords="AI", keywords="digital health", keywords="health technology", keywords="health technologies", keywords="interview", keywords="interviews", keywords="parent", keywords="parents", abstract="Background: The use of artificial intelligence (AI) for pain assessment has the potential to address historical challenges in infant pain assessment. There is a dearth of information on the perceived benefits and barriers to the implementation of AI for neonatal pain monitoring in the neonatal intensive care unit (NICU) from the perspective of health care professionals (HCPs) and parents. This qualitative analysis provides novel data obtained from 2 large tertiary care hospitals in Canada and the United Kingdom. Objective: The aim of the study is to explore the perspectives of HCPs and parents regarding the use of AI for pain assessment in the NICU. Methods: In total, 20 HCPs and 20 parents of preterm infants were recruited and consented to participate from February 2020 to October 2022 in interviews asking about AI use for pain assessment in the NICU, potential benefits of the technology, and potential barriers to use. Results: The 40 participants included 20 HCPs (17 women and 3 men) with an average of 19.4 (SD 10.69) years of experience in the NICU and 20 parents (mean age 34.4, SD 5.42 years) of preterm infants who were on average 43 (SD 30.34) days old. Six themes from the perspective of HCPs were identified: regular use of technology in the NICU, concerns with regard to AI integration, the potential to improve patient care, requirements for implementation, AI as a tool for pain assessment, and ethical considerations. 
Seven parent themes included the potential for improved care, increased parental distress, support for parents regarding AI, the impact on parent engagement, the importance of human care, requirements for integration, and the desire for choice in its use. A consistent theme was the importance of AI as a tool to inform clinical decision-making and not replace it. Conclusions: HCPs and parents expressed generally positive sentiments about the potential use of AI for pain assessment in the NICU, with HCPs highlighting important ethical considerations. This study identifies critical methodological and ethical perspectives from key stakeholders that should be noted by any team considering the creation and implementation of AI for pain monitoring in the NICU. ", doi="10.2196/51535", url="https://ai.jmir.org/2024/1/e51535", url="http://www.ncbi.nlm.nih.gov/pubmed/38875686" } @Article{info:doi/10.2196/44185, author="Pan, Cheng and Luo, Hao and Cheung, Gary and Zhou, Huiquan and Cheng, Reynold and Cullum, Sarah and Wu, Chuan", title="Identifying Frailty in Older Adults Receiving Home Care Assessment Using Machine Learning: Longitudinal Observational Study on the Role of Classifier, Feature Selection, and Sample Size", journal="JMIR AI", year="2024", month="Jan", day="31", volume="3", pages="e44185", keywords="machine learning", keywords="logistic regression", keywords="frailty", keywords="older adults", keywords="home care", keywords="sample size", keywords="features", keywords="data set", keywords="model", keywords="mortality prediction", keywords="assessment", abstract="Background: Machine learning techniques are starting to be used in various health care data sets to identify frail persons who may benefit from interventions. However, evidence about the performance of machine learning techniques compared to conventional regression is mixed. It is also unclear what methodological and database factors are associated with performance. Objective: This study aimed to compare the mortality prediction accuracy of various machine learning classifiers for identifying frail older adults in different scenarios. Methods: We used deidentified data collected from older adults (65 years of age and older) assessed with the interRAI Home Care instrument in New Zealand between January 1, 2012, and December 31, 2016. A total of 138 interRAI assessment items were used to predict 6-month and 12-month mortality, using 3 machine learning classifiers (random forest [RF], extreme gradient boosting [XGBoost], and multilayer perceptron [MLP]) and regularized logistic regression. We conducted a simulation study comparing the performance of machine learning models with logistic regression and the interRAI Home Care Frailty Scale and examined the effects of sample sizes, the number of features, and train-test split ratios. Results: A total of 95,042 older adults (median age 82.66 years, IQR 77.92-88.76; n=37,462, 39.42\% male) receiving home care were analyzed. The average area under the curve (AUC) and sensitivities of 6-month mortality prediction showed that machine learning classifiers did not outperform regularized logistic regressions. In terms of AUC, regularized logistic regression had better performance than XGBoost, MLP, and RF when the number of features was ≤80 and the sample size ≤16,000; MLP outperformed regularized logistic regression in terms of sensitivities when the number of features was ≥40 and the sample size ≥4000. 
Conversely, RF and XGBoost demonstrated higher specificities than regularized logistic regression in all scenarios. Conclusions: The study revealed that machine learning models exhibited significant variation in prediction performance when evaluated using different metrics. Regularized logistic regression was an effective model for identifying frail older adults receiving home care, as indicated by the AUC, particularly when the number of features and sample sizes were not excessively large. Conversely, MLP displayed superior sensitivity, while RF exhibited superior specificity when the number of features and sample sizes were large. ", doi="10.2196/44185", url="https://ai.jmir.org/2024/1/e44185" } @Article{info:doi/10.2196/47240, author="Lu, Jiahui and Zhang, Huibin and Xiao, Yi and Wang, Yingyu", title="An Environmental Uncertainty Perception Framework for Misinformation Detection and Spread Prediction in the COVID-19 Pandemic: Artificial Intelligence Approach", journal="JMIR AI", year="2024", month="Jan", day="29", volume="3", pages="e47240", keywords="misinformation detection", keywords="misinformation spread prediction", keywords="uncertainty", keywords="COVID-19", keywords="information environment", abstract="Background: Amidst the COVID-19 pandemic, misinformation on social media has posed significant threats to public health. Detecting and predicting the spread of misinformation are crucial for mitigating its adverse effects. However, prevailing frameworks for these tasks have predominantly focused on post-level signals of misinformation, neglecting features of the broader information environment where misinformation originates and proliferates. Objective: This study aims to create a novel framework that integrates the uncertainty of the information environment into misinformation features, with the goal of enhancing the model's accuracy in tasks such as misinformation detection and predicting the scale of dissemination. The objective is to provide better support for online governance efforts during health crises. Methods: In this study, we embraced uncertainty features within the information environment and introduced a novel Environmental Uncertainty Perception (EUP) framework for the detection of misinformation and the prediction of its spread on social media. The framework encompasses uncertainty at 4 scales of the information environment: physical environment, macro-media environment, micro-communicative environment, and message framing. We assessed the effectiveness of the EUP using real-world COVID-19 misinformation data sets. Results: The experimental results demonstrated that the EUP alone achieved notably good performance, with detection accuracy at 0.753 and prediction accuracy at 0.71. These results were comparable to state-of-the-art baseline models such as bidirectional long short-term memory (BiLSTM; detection accuracy 0.733 and prediction accuracy 0.707) and bidirectional encoder representations from transformers (BERT; detection accuracy 0.755 and prediction accuracy 0.728). Additionally, when the baseline models collaborated with the EUP, they exhibited improved accuracy by an average of 1.98\% for the misinformation detection and 2.4\% for spread-prediction tasks. On unbalanced data sets, the EUP yielded relative improvements of 21.5\% and 5.7\% in macro-F1-score and area under the curve, respectively. 
Conclusions: This study makes a significant contribution to the literature by recognizing uncertainty features within information environments as a crucial factor for improving misinformation detection and spread-prediction algorithms during the pandemic. The research elaborates on the complexities of uncertain information environments for misinformation across 4 distinct scales, including the physical environment, macro-media environment, micro-communicative environment, and message framing. The findings underscore the effectiveness of incorporating uncertainty into misinformation detection and spread prediction, providing an interdisciplinary and easily implementable framework for the field. ", doi="10.2196/47240", url="https://ai.jmir.org/2024/1/e47240", url="http://www.ncbi.nlm.nih.gov/pubmed/38875583" } @Article{info:doi/10.2196/49082, author="Hansen, Steffan and Brandt, Joakim Carl and S{\o}ndergaard, Jens", title="Beyond the Hype---The Actual Role and Risks of AI in Today's Medical Practice: Comparative-Approach Study", journal="JMIR AI", year="2024", month="Jan", day="22", volume="3", pages="e49082", keywords="AI", keywords="artificial intelligence", keywords="ChatGPT-4", keywords="Microsoft Bing", keywords="general practice", keywords="ChatGPT", keywords="chatbot", keywords="chatbots", keywords="writing", keywords="academic", keywords="academia", keywords="Bing", abstract="Background: The evolution of artificial intelligence (AI) has significantly impacted various sectors, with health care witnessing some of its most groundbreaking contributions. Contemporary models, such as ChatGPT-4 and Microsoft Bing, have showcased capabilities beyond just generating text, aiding in complex tasks like literature searches and refining web-based queries. Objective: This study explores a compelling query: can AI author an academic paper independently? Our assessment focuses on four core dimensions: relevance (to ensure that AI's response directly addresses the prompt), accuracy (to ascertain that AI's information is both factually correct and current), clarity (to examine AI's ability to present coherent and logical ideas), and tone and style (to evaluate whether AI can align with the formality expected in academic writings). Additionally, we will consider the ethical implications and practicality of integrating AI into academic writing. Methods: To assess the capabilities of ChatGPT-4 and Microsoft Bing in the context of academic paper assistance in general practice, we used a systematic approach. ChatGPT-4, an advanced AI language model by OpenAI, excels in generating human-like text and adapting responses based on user interactions, though it has a knowledge cut-off in September 2021. Microsoft Bing's AI chatbot facilitates user navigation on the Bing search engine, offering tailored search results. Results: In terms of relevance, ChatGPT-4 delved deeply into AI's health care role, citing academic sources and discussing diverse applications and concerns, while Microsoft Bing provided a concise, less detailed overview. In terms of accuracy, ChatGPT-4 correctly cited 72\% (23/32) of its peer-reviewed articles but included some nonexistent references. Microsoft Bing's accuracy stood at 46\% (6/13), supplemented by relevant non--peer-reviewed articles. In terms of clarity, both models conveyed clear, coherent text. ChatGPT-4 was particularly adept at detailing technical concepts, while Microsoft Bing was more general. 
In terms of tone, both models maintained an academic tone, but ChatGPT-4 exhibited superior depth and breadth in content delivery. Conclusions: Comparing ChatGPT-4 and Microsoft Bing for academic assistance revealed strengths and limitations. ChatGPT-4 excels in depth and relevance but falters in citation accuracy. Microsoft Bing is concise but lacks robust detail. Though both models have potential, neither can independently handle comprehensive academic tasks. As AI evolves, combining ChatGPT-4's depth with Microsoft Bing's up-to-date referencing could optimize academic support. Researchers should critically assess AI outputs to maintain academic credibility. ", doi="10.2196/49082", url="https://ai.jmir.org/2024/1/e49082" } @Article{info:doi/10.2196/51204, author="Weidener, Lukas and Fischer, Michael", title="Role of Ethics in Developing AI-Based Applications in Medicine: Insights From Expert Interviews and Discussion of Implications", journal="JMIR AI", year="2024", month="Jan", day="12", volume="3", pages="e51204", keywords="artificial intelligence", keywords="AI", keywords="medicine", keywords="ethics", keywords="expert interviews", keywords="AI development", keywords="AI ethics", abstract="Background: The integration of artificial intelligence (AI)--based applications in the medical field has increased significantly, offering potential improvements in patient care and diagnostics. However, alongside these advancements, there is growing concern about ethical considerations, such as bias, informed consent, and trust in the development of these technologies. Objective: This study aims to assess the role of ethics in the development of AI-based applications in medicine. Furthermore, this study focuses on the potential consequences of neglecting ethical considerations in AI development, particularly their impact on patients and physicians. Methods: Qualitative content analysis was used to analyze the responses from expert interviews. Experts were selected based on their involvement in the research or practical development of AI-based applications in medicine for at least 5 years, leading to the inclusion of 7 experts in the study. Results: The analysis revealed 3 main categories and 7 subcategories reflecting a wide range of views on the role of ethics in AI development. This variance underscores the subjectivity and complexity of integrating ethics into the development of AI in medicine. Although some experts view ethics as fundamental, others prioritize performance and efficiency, with some perceiving ethics as potential obstacles to technological progress. This dichotomy of perspectives clearly emphasizes the subjectivity and complexity surrounding the role of ethics in AI development, reflecting the inherent multifaceted nature of this issue. Conclusions: Despite the methodological limitations impacting the generalizability of the results, this study underscores the critical importance of consistent and integrated ethical considerations in AI development for medical applications. It advocates further research into effective strategies for ethical AI development, emphasizing the need for transparent and responsible practices, consideration of diverse data sources, physician training, and the establishment of comprehensive ethical and legal frameworks. 
", doi="10.2196/51204", url="https://ai.jmir.org/2024/1/e51204", url="http://www.ncbi.nlm.nih.gov/pubmed/38875585" } @Article{info:doi/10.2196/50442, author="Odabashian, Roupen and Bastin, Donald and Jones, Georden and Manzoor, Maria and Tangestaniapour, Sina and Assad, Malke and Lakhani, Sunita and Odabashian, Maritsa and McGee, Sharon", title="Assessment of ChatGPT-3.5's Knowledge in Oncology: Comparative Study with ASCO-SEP Benchmarks", journal="JMIR AI", year="2024", month="Jan", day="12", volume="3", pages="e50442", keywords="artificial intelligence", keywords="ChatGPT-3.5", keywords="language model", keywords="medical oncology", abstract="Background: ChatGPT (OpenAI) is a state-of-the-art large language model that uses artificial intelligence (AI) to address questions across diverse topics. The American Society of Clinical Oncology Self-Evaluation Program (ASCO-SEP) created a comprehensive educational program to help physicians keep up to date with the many rapid advances in the field. The question bank consists of multiple-choice questions addressing the many facets of cancer care, including diagnosis, treatment, and supportive care. As ChatGPT applications rapidly expand, it becomes vital to ascertain if the knowledge of ChatGPT-3.5 matches the established standards that oncologists are recommended to follow. Objective: This study aims to evaluate whether ChatGPT-3.5's knowledge aligns with the established benchmarks that oncologists are expected to adhere to. This will furnish us with a deeper understanding of the potential applications of this tool as a support for clinical decision-making. Methods: We conducted a systematic assessment of the performance of ChatGPT-3.5 on the ASCO-SEP, the leading educational and assessment tool for medical oncologists in training and practice. Over 1000 multiple-choice questions covering the spectrum of cancer care were extracted. Questions were categorized by cancer type or discipline, with subcategorization as treatment, diagnosis, or other. Answers were scored as correct if ChatGPT-3.5 selected the answer as defined by ASCO-SEP. Results: Overall, ChatGPT-3.5 achieved a score of 56.1\% (583/1040) for the correct answers provided. The program demonstrated varying levels of accuracy across cancer types or disciplines. The highest accuracy was observed in questions related to developmental therapeutics (8/10; 80\% correct), while the lowest accuracy was observed in questions related to gastrointestinal cancer (102/209; 48.8\% correct). There was no significant difference in the program's performance across the predefined subcategories of diagnosis, treatment, and other (P=.16). Conclusions: This study evaluated ChatGPT-3.5's oncology knowledge using the ASCO-SEP, aiming to address uncertainties regarding AI tools like ChatGPT in clinical decision-making. Our findings suggest that while ChatGPT-3.5 offers a hopeful outlook for AI in oncology, its present performance in ASCO-SEP tests necessitates further refinement to reach the requisite competency levels. Future assessments could explore ChatGPT's clinical decision support capabilities with real-world clinical scenarios, its ease of integration into medical workflows, and its potential to foster interdisciplinary collaboration and patient engagement in health care settings. 
", doi="10.2196/50442", url="https://ai.jmir.org/2024/1/e50442" } @Article{info:doi/10.2196/46840, author="Irie, Fumi and Matsumoto, Koutarou and Matsuo, Ryu and Nohara, Yasunobu and Wakisaka, Yoshinobu and Ago, Tetsuro and Nakashima, Naoki and Kitazono, Takanari and Kamouchi, Masahiro", title="Predictive Performance of Machine Learning--Based Models for Poststroke Clinical Outcomes in Comparison With Conventional Prognostic Scores: Multicenter, Hospital-Based Observational Study", journal="JMIR AI", year="2024", month="Jan", day="11", volume="3", pages="e46840", keywords="brain infarction", keywords="outcome", keywords="prediction", keywords="machine learning", keywords="prognostic score", abstract="Background: Although machine learning is a promising tool for making prognoses, the performance of machine learning in predicting outcomes after stroke remains to be examined. Objective: This study aims to examine how much data-driven models with machine learning improve predictive performance for poststroke outcomes compared with conventional stroke prognostic scores and to elucidate how explanatory variables in machine learning--based models differ from the items of the stroke prognostic scores. Methods: We used data from 10,513 patients who were registered in a multicenter prospective stroke registry in Japan between 2007 and 2017. The outcomes were poor functional outcome (modified Rankin Scale score >2) and death at 3 months after stroke. Machine learning--based models were developed using all variables with regularization methods, random forests, or boosted trees. We selected 3 stroke prognostic scores, namely, ASTRAL (Acute Stroke Registry and Analysis of Lausanne), PLAN (preadmission comorbidities, level of consciousness, age, neurologic deficit), and iScore (Ischemic Stroke Predictive Risk Score) for comparison. Item-based regression models were developed using the items of these 3 scores. The model performance was assessed in terms of discrimination and calibration. To compare the predictive performance of the data-driven model with that of the item-based model, we performed internal validation after random splits of identical populations into 80\% of patients as a training set and 20\% of patients as a test set; the models were developed in the training set and were validated in the test set. We evaluated the contribution of each variable to the models and compared the predictors used in the machine learning--based models with the items of the stroke prognostic scores. Results: The mean age of the study patients was 73.0 (SD 12.5) years, and 59.1\% (6209/10,513) of them were men. The area under the receiver operating characteristic curves and the area under the precision-recall curves for predicting poststroke outcomes were higher for machine learning--based models than for item-based models in identical populations after random splits. Machine learning--based models also performed better than item-based models in terms of the Brier score. Machine learning--based models used different explanatory variables, such as laboratory data, from the items of the conventional stroke prognostic scores. Including these data in the machine learning--based models as explanatory variables improved performance in predicting outcomes after stroke, especially poststroke death. 
Conclusions: Machine learning--based models performed better in predicting poststroke outcomes than regression models using the items of conventional stroke prognostic scores, although they required additional variables, such as laboratory data, to attain improved performance. Further studies are warranted to validate the usefulness of machine learning in clinical settings. ", doi="10.2196/46840", url="https://ai.jmir.org/2024/1/e46840", url="http://www.ncbi.nlm.nih.gov/pubmed/38875590" } @Article{info:doi/10.2196/46317, author="Abrams, P. Matthew and Merchant, M. Raina and Meisel, F. Zachary and Pelullo, P. Arthur and Chandra Guntuku, Sharath and Agarwal, K. Anish", title="Association Between Online Reviews of Substance Use Disorder Treatment Facilities and Drug-Induced Mortality Rates: Cross-Sectional Analysis", journal="JMIR AI", year="2023", month="Dec", day="29", volume="2", pages="e46317", keywords="opioid use disorder", keywords="online reviews", keywords="drug-induced mortality", keywords="addiction", keywords="substance use disorder treatment", keywords="substance use disorder", keywords="patient-centered care", keywords="digital health", keywords="treatment", keywords="substance use", keywords="online review", keywords="drug use", keywords="mortality", keywords="database", keywords="detoxification", keywords="rehabilitation", keywords="communication", keywords="patient-centered", abstract="Background: Drug-induced mortality across the United States has continued to rise. To date, there are limited measures to evaluate patient preferences and priorities regarding substance use disorder (SUD) treatment, and many patients do not have access to evidence-based treatment options. Patients and their families seeking SUD treatment may begin their search for an SUD treatment facility online, where they can find information about individual facilities, as well as a summary of patient-generated web-based reviews via popular platforms such as Google or Yelp. Web-based reviews of health care facilities may reflect information about factors associated with positive or negative patient satisfaction. The association between patient satisfaction with SUD treatment and drug-induced mortality is not well understood. Objective: The objective of this study was to examine the association between online review content of SUD treatment facilities and drug-induced state mortality. Methods: A cross-sectional analysis of online reviews and ratings of Substance Abuse and Mental Health Services Administration (SAMHSA)--designated SUD treatment facilities listed between September 2005 and October 2021 was conducted. The primary outcomes were (1) mean online rating of SUD treatment facilities from 1 star (worst) to 5 stars (best) and (2) average drug-induced mortality rates from the Centers for Disease Control and Prevention (CDC) WONDER Database (2006-2019). Clusters of words with differential frequencies within reviews were identified. A 3-level linear model was used to estimate the association between online review ratings and drug-induced mortality. Results: A total of 589 SAMHSA-designated facilities (n=9597 reviews) were included in this study. Drug-induced mortality was compared with the average. Approximately half (24/47, 51\%) of states had below average (``low'') mortality rates (mean 13.40, SD 2.45 deaths per 100,000 people), and half (23/47, 49\%) had above average (``high'') drug-induced mortality rates (mean 21.92, SD 3.69 deaths per 100,000 people). 
The top 5 themes associated with low drug-induced mortality included detoxification and addiction rehabilitation services (r=0.26), gratitude for recovery (r=--0.25), thankful for treatment (r=--0.32), caring staff and amazing experience (r=--0.23), and individualized recovery programs (r=--0.20). The top 5 themes associated with high mortality were care from doctors or providers (r=0.24), rude and insensitive care (r=0.23), medication and prescriptions (r=0.22), front desk and reception experience (r=0.22), and dissatisfaction with communication (r=0.21). In the multilevel linear model, a state with a 10 deaths per 100,000 people increase in mortality was associated with a 0.30 lower average Yelp rating (P=.005). Conclusions: Lower online ratings of SUD treatment facilities were associated with higher drug-induced mortality at the state level. Elements of patient experience may be associated with state-level mortality. Identified themes from online, organically derived patient content can inform efforts to improve high-quality and patient-centered SUD care. ", doi="10.2196/46317", url="https://ai.jmir.org/2023/1/e46317", url="http://www.ncbi.nlm.nih.gov/pubmed/38875553" } @Article{info:doi/10.2196/45770, author="Gu, Jiasheng and Gao, Chongyang and Wang, Lili", title="The Evolution of Artificial Intelligence in Biomedicine: Bibliometric Analysis", journal="JMIR AI", year="2023", month="Dec", day="19", volume="2", pages="e45770", keywords="bibliometrics", keywords="trend forecasting", keywords="AI in medicine", keywords="Word2Vec", keywords="regression models", keywords="agglomerative clustering", keywords="usage", keywords="artificial intelligence", keywords="utilization", keywords="biomedical", keywords="effectiveness", keywords="AI trends", keywords="predictive model", keywords="development", abstract="Background: The utilization of artificial intelligence (AI) technologies in the biomedical field has attracted increasing attention in recent decades. Studying how past AI technologies have found their way into medicine over time can help to predict which current (and future) AI technologies have the potential to be utilized in medicine in the coming years, thereby providing a helpful reference for future research directions. Objective: The aim of this study was to predict the future trend of AI technologies used in different biomedical domains based on past trends of related technologies and biomedical domains. Methods: We collected a large corpus of articles from the PubMed database pertaining to the intersection of AI and biomedicine. Initially, we attempted to use regression on the extracted keywords alone; however, we found that this approach did not provide sufficient information. Therefore, we propose a method called ``background-enhanced prediction'' to expand the knowledge utilized by the regression algorithm by incorporating both the keywords and their surrounding context. This method of data construction resulted in improved performance across the six regression models evaluated. Our findings were confirmed through experiments on recurrent prediction and forecasting. Results: In our analysis using background information for prediction, we found that a window size of 3 yielded the best results, outperforming the use of keywords alone. 
Furthermore, utilizing data only prior to 2017, our regression projections for the period of 2017-2021 exhibited a high coefficient of determination (R2), which reached up to 0.78, demonstrating the effectiveness of our method in predicting long-term trends. Based on the prediction, studies related to proteins and tumors will be pushed out of the top 20 and become replaced by early diagnostics, tomography, and other detection technologies. These are certain areas that are well-suited to incorporate AI technology. Deep learning, machine learning, and neural networks continue to be the dominant AI technologies in biomedical applications. Generative adversarial networks represent an emerging technology with a strong growth trend. Conclusions: In this study, we explored AI trends in the biomedical field and developed a predictive model to forecast future trends. Our findings were confirmed through experiments on current trends. ", doi="10.2196/45770", url="https://ai.jmir.org/2023/1/e45770", url="http://www.ncbi.nlm.nih.gov/pubmed/38875563" } @Article{info:doi/10.2196/49023, author="Wilimitis, Drew and Walsh, G. Colin", title="Practical Considerations and Applied Examples of Cross-Validation for Model Development and Evaluation in Health Care: Tutorial", journal="JMIR AI", year="2023", month="Dec", day="18", volume="2", pages="e49023", keywords="predictive modeling", keywords="cross-validation", keywords="tutorial", keywords="model development", keywords="risk detection", keywords="clinical decision-making", keywords="electronic health care", keywords="eHealth data", keywords="health care data", keywords="data validation", keywords="artificial intelligence", keywords="AI", doi="10.2196/49023", url="https://ai.jmir.org/2023/1/e49023", url="http://www.ncbi.nlm.nih.gov/pubmed/38875530" } @Article{info:doi/10.2196/44358, author="Rollwage, Max and Habicht, Johanna and Juechems, Keno and Carrington, Ben and Viswanathan, Sruthi and Stylianou, Mona and Hauser, U. Tobias and Harper, Ross", title="Using Conversational AI to Facilitate Mental Health Assessments and Improve Clinical Efficiency Within Psychotherapy Services: Real-World Observational Study", journal="JMIR AI", year="2023", month="Dec", day="13", volume="2", pages="e44358", keywords="artificial intelligence", keywords="National Health Service", keywords="NHS", keywords="Improving Access to Psychological Therapies", keywords="IAPT", keywords="mental health", keywords="mental health assessment", keywords="triage", keywords="decision-support", keywords="referral", keywords="chatbot", keywords="psychotherapy", keywords="conversational agent", keywords="assessment", keywords="Talking Therapies", abstract="Background: Most mental health care providers face the challenge of increased demand for psychotherapy in the absence of increased funding or staffing. To overcome this supply-demand imbalance, care providers must increase the efficiency of service delivery. Objective: In this study, we examined whether artificial intelligence (AI)--enabled digital solutions can help mental health care practitioners to use their time more efficiently, and thus reduce strain on services and improve patient outcomes. Methods: In this study, we focused on the use of an AI solution (Limbic Access) to support initial patient referral and clinical assessment within the UK's National Health Service. Data were collected from 9 Talking Therapies services across England, comprising 64,862 patients. 
Results: We showed that the use of this AI solution improves clinical efficiency by reducing the time clinicians spend on mental health assessments. Furthermore, we found improved outcomes for patients using the AI solution in several key metrics, such as reduced wait times, reduced dropout rates, improved allocation to appropriate treatment pathways, and, most importantly, improved recovery rates. When investigating the mechanism by which the AI solution achieved these improvements, we found that the provision of clinically relevant information ahead of clinical assessment was critical for these observed effects. Conclusions: Our results emphasize the utility of using AI solutions to support the mental health workforce, further highlighting the potential of AI solutions to increase the efficiency of care delivery and improve clinical outcomes for patients. ", doi="10.2196/44358", url="https://ai.jmir.org/2023/1/e44358" } @Article{info:doi/10.2196/46717, author="Budiarto, Arif and Tsang, H. Kevin C. and Wilson, M. Andrew and Sheikh, Aziz and Shah, Ahmar Syed", title="Machine Learning--Based Asthma Attack Prediction Models From Routinely Collected Electronic Health Records: Systematic Scoping Review", journal="JMIR AI", year="2023", month="Dec", day="7", volume="2", pages="e46717", keywords="asthma attack", keywords="exacerbation", keywords="prognosis", keywords="machine learning", keywords="electronic health record", keywords="review", keywords="EHR", keywords="asthma", abstract="Background: An early warning tool to predict attacks could enhance asthma management and reduce the likelihood of serious consequences. Electronic health records (EHRs) providing access to historical data about patients with asthma, coupled with machine learning (ML), provide an opportunity to develop such a tool. Several studies have developed ML-based tools to predict asthma attacks. Objective: This study aims to critically evaluate ML-based models derived using EHRs for the prediction of asthma attacks. Methods: We systematically searched PubMed and Scopus (the search period was between January 1, 2012, and January 31, 2023) for papers meeting the following inclusion criteria: (1) used EHR data as the main data source, (2) used asthma attack as the outcome, and (3) compared ML-based prediction models' performance. We excluded non-English papers and nonresearch papers, such as commentary and systematic review papers. We also excluded papers that did not provide any details about the respective ML approach and its result, including protocol papers. The selected studies were then summarized across multiple dimensions including data preprocessing methods, ML algorithms, model validation, model explainability, and model implementation. Results: Overall, 17 papers were included at the end of the selection process. There was considerable heterogeneity in how asthma attacks were defined. Of the 17 studies, 8 (47\%) studies used routinely collected data both from primary care and secondary care practices together. Extremely imbalanced data was a notable issue in most studies (13/17, 76\%), but only 38\% (5/13) of them explicitly dealt with it in their data preprocessing pipeline. The gradient boosting--based method was the best ML method in 59\% (10/17) of the studies. Of the 17 studies, 14 (82\%) studies used a model explanation method to identify the most important predictors. None of the studies followed the standard reporting guidelines, and none were prospectively validated. 
Conclusions: Our review indicates that this research field is still underdeveloped, given the limited body of evidence, heterogeneity of methods, lack of external validation, and suboptimally reported models. We highlighted several technical challenges (class imbalance, external validation, model explanation, and adherence to reporting guidelines to aid reproducibility) that need to be addressed to make progress toward clinical adoption. ", doi="10.2196/46717", url="https://ai.jmir.org/2023/1/e46717", url="http://www.ncbi.nlm.nih.gov/pubmed/38875586" } @Article{info:doi/10.2196/52888, author="Hendricks-Sturrup, Rachele and Simmons, Malaika and Anders, Shilo and Aneni, Kammarauche and Wright Clayton, Ellen and Coco, Joseph and Collins, Benjamin and Heitman, Elizabeth and Hussain, Sajid and Joshi, Karuna and Lemieux, Josh and Lovett Novak, Laurie and Rubin, J. Daniel and Shanker, Anil and Washington, Talitha and Waters, Gabriella and Webb Harris, Joyce and Yin, Rui and Wagner, Teresa and Yin, Zhijun and Malin, Bradley", title="Developing Ethics and Equity Principles, Terms, and Engagement Tools to Advance Health Equity and Researcher Diversity in AI and Machine Learning: Modified Delphi Approach", journal="JMIR AI", year="2023", month="Dec", day="6", volume="2", pages="e52888", keywords="artificial intelligence", keywords="AI", keywords="Delphi", keywords="disparities", keywords="disparity", keywords="engagement", keywords="equitable", keywords="equities", keywords="equity", keywords="ethic", keywords="ethical", keywords="ethics", keywords="fair", keywords="fairness", keywords="health disparities", keywords="health equity", keywords="humanitarian", keywords="machine learning", keywords="ML", abstract="Background: Artificial intelligence (AI) and machine learning (ML) technology design and development continues to be rapid, despite major limitations in its current form as a practice and discipline to address all sociohumanitarian issues and complexities. From these limitations emerges an imperative to strengthen AI and ML literacy in underserved communities and build a more diverse AI and ML design and development workforce engaged in health research. Objective: AI and ML has the potential to account for and assess a variety of factors that contribute to health and disease and to improve prevention, diagnosis, and therapy. Here, we describe recent activities within the Artificial Intelligence/Machine Learning Consortium to Advance Health Equity and Researcher Diversity (AIM-AHEAD) Ethics and Equity Workgroup (EEWG) that led to the development of deliverables that will help put ethics and fairness at the forefront of AI and ML applications to build equity in biomedical research, education, and health care. Methods: The AIM-AHEAD EEWG was created in 2021 with 3 cochairs and 51 members in year 1 and 2 cochairs and {\textasciitilde}40 members in year 2. Members in both years included AIM-AHEAD principal investigators, coinvestigators, leadership fellows, and research fellows. The EEWG used a modified Delphi approach using polling, ranking, and other exercises to facilitate discussions around tangible steps, key terms, and definitions needed to ensure that ethics and fairness are at the forefront of AI and ML applications to build equity in biomedical research, education, and health care. Results: The EEWG developed a set of ethics and equity principles, a glossary, and an interview guide. 
The ethics and equity principles comprise 5 core principles, each with subparts, which articulate best practices for working with stakeholders from historically and presently underrepresented communities. The glossary contains 12 terms and definitions, with particular emphasis on optimal development, refinement, and implementation of AI and ML in health equity research. To accompany the glossary, the EEWG developed a concept relationship diagram that describes the logical flow of and relationship between the definitional concepts. Lastly, the interview guide provides questions that can be used or adapted to garner stakeholder and community perspectives on the principles and glossary. Conclusions: Ongoing engagement is needed around our principles and glossary to identify and predict potential limitations in their uses in AI and ML research settings, especially for institutions with limited resources. This requires time, careful consideration, and honest discussions around what classifies an engagement incentive as meaningful to support and sustain their full engagement. By slowing down to meet historically and presently underresourced institutions and communities where they are and where they are capable of engaging and competing, there is higher potential to achieve needed diversity, ethics, and equity in AI and ML implementation in health research. ", doi="10.2196/52888", url="https://ai.jmir.org/2023/1/e52888", url="http://www.ncbi.nlm.nih.gov/pubmed/38875540" } @Article{info:doi/10.2196/49531, author="Jamali, Akbar Ali and Berger, Corinne and Spiteri, J. Raymond", title="Momentary Depressive Feeling Detection Using X (Formerly Twitter) Data: Contextual Language Approach", journal="JMIR AI", year="2023", month="Nov", day="27", volume="2", pages="e49531", keywords="depression", keywords="momentary depressive feelings", keywords="X (Twitter)", keywords="natural language processing", keywords="lexicon", keywords="machine learning", keywords="transfer learning", abstract="Background: Depression and momentary depressive feelings are major public health concerns imposing a substantial burden on both individuals and society. Early detection of momentary depressive feelings is highly beneficial in reducing this burden and improving the quality of life for affected individuals. To this end, the abundance of data exemplified by X (formerly Twitter) presents an invaluable resource for discerning insights into individuals' mental states and enabling timely detection of these transitory depressive feelings. Objective: The objective of this study was to automate the detection of momentary depressive feelings in posts using contextual language approaches. Methods: First, we identified terms expressing momentary depressive feelings and depression, scaled their relevance to depression, and constructed a lexicon. Then, we scraped posts using this lexicon and labeled them manually. Finally, we assessed the performance of the Bidirectional Encoder Representations From Transformers (BERT), A Lite BERT (ALBERT), Robustly Optimized BERT Approach (RoBERTa), Distilled BERT (DistilBERT), convolutional neural network (CNN), bidirectional long short-term memory (BiLSTM), and machine learning (ML) algorithms in detecting momentary depressive feelings in posts. Results: This study demonstrates a notable distinction in performance between binary classification, aimed at identifying posts conveying depressive sentiments and multilabel classification, designed to categorize such posts across multiple emotional nuances. 
Specifically, binary classification emerges as the more adept approach in this context, outperforming multilabel classification. This outcome stems from several critical factors that underscore the nuanced nature of depressive expressions within social media. Our results show that when using binary classification, BERT and DistilBERT (pretrained transfer learning algorithms) may outperform traditional ML algorithms. Particularly, DistilBERT achieved the best performance in terms of area under the curve (96.71\%), accuracy (97.4\%), sensitivity (97.57\%), specificity (97.22\%), precision (97.30\%), and F1-score (97.44\%). DistilBERT obtained an area under the curve nearly 12 percentage points higher than that of the best-performing traditional ML algorithm, the convolutional neural network. This study showed that transfer learning algorithms are highly effective in extracting knowledge from posts and detecting momentary depressive feelings, highlighting their superiority in contextual analysis. Conclusions: Our findings suggest that contextual language approaches---particularly those rooted in transfer learning---are reliable approaches to automate the early detection of momentary depressive feelings and can be used to develop social media monitoring tools for identifying individuals who may be at risk of depression. The implications are far-reaching because these approaches stand poised to inform the creation of social media monitoring tools and are pivotal for identifying individuals susceptible to depression. By intervening proactively, these tools possess the potential to slow the progression of depressive feelings, effectively mitigating the societal load of depression and fostering improved mental health. In addition to highlighting the capabilities of automated sentiment analysis, this study illuminates its pivotal role in advancing global public health. ", doi="10.2196/49531", url="https://ai.jmir.org/2023/1/e49531", url="http://www.ncbi.nlm.nih.gov/pubmed/38875532" } @Article{info:doi/10.2196/46779, author="Ekpezu, Obu Akon and Wiafe, Isaac and Oinas-Kukkonen, Harri", title="Predicting Adherence to Behavior Change Support Systems Using Machine Learning: Systematic Review", journal="JMIR AI", year="2023", month="Nov", day="22", volume="2", pages="e46779", keywords="adherence", keywords="compliance", keywords="behavior change support systems", keywords="persuasive systems", keywords="persuasive technology", keywords="machine learning", abstract="Background: There is a dearth of knowledge on reliable adherence prediction measures in behavior change support systems (BCSSs). Existing reviews have predominantly focused on self-reporting measures of adherence. These measures are susceptible to overestimation or underestimation of adherence behavior. Objective: This systematic review seeks to identify and summarize trends in the use of machine learning approaches to predict adherence to BCSSs. Methods: Systematic literature searches were conducted in the Scopus and PubMed electronic databases between January 2011 and August 2022. The initial search retrieved 2182 journal papers, but only 11 of these papers were eligible for this review. Results: A total of 4 categories of adherence problems in BCSSs were identified: adherence to digital cognitive and behavioral interventions, medication adherence, physical activity adherence, and diet adherence. The use of machine learning techniques for real-time adherence prediction in BCSSs is gaining research attention. 
A total of 13 unique supervised learning techniques were identified, and the majority of them were traditional machine learning techniques (eg, support vector machine). Long short-term memory, multilayer perceptron, and ensemble learning are currently the only advanced learning techniques. Despite the heterogeneity in the feature selection approaches, most prediction models achieved good classification accuracies. This indicates that the features or predictors used were a good representation of the adherence problem. Conclusions: Using machine learning algorithms to predict the adherence behavior of a BCSS user can facilitate the reinforcement of adherence behavior. This can be achieved by developing intelligent BCSSs that can provide users with more personalized, tailored, and timely suggestions. ", doi="10.2196/46779", url="https://ai.jmir.org/2023/1/e46779" } @Article{info:doi/10.2196/45257, author="Lashen, Hazem and St John, Lee Terrence and Almallah, Zaki Y. and Sasidhar, Madhu and Shamout, E. Farah", title="Machine Learning Models Versus the National Early Warning Score System for Predicting Deterioration: Retrospective Cohort Study in the United Arab Emirates", journal="JMIR AI", year="2023", month="Nov", day="6", volume="2", pages="e45257", keywords="machine learning", keywords="early warning score system", keywords="clinical deterioration", keywords="early warning", keywords="score system", keywords="cohort", keywords="real-world data", keywords="neural network", keywords="predict", keywords="deterioration", abstract="Background: Early warning score systems are widely used for identifying patients who are at the highest risk of deterioration to assist clinical decision-making. This could facilitate early intervention and consequently improve patient outcomes; for example, the National Early Warning Score (NEWS) system, which is recommended by the Royal College of Physicians in the United Kingdom, uses predefined alerting thresholds to assign scores to patients based on their vital signs. However, there is limited evidence of the reliability of such scores across patient cohorts in the United Arab Emirates. Objective: Our aim in this study was to propose a data-driven model that accurately predicts in-hospital deterioration in an inpatient cohort in the United Arab Emirates. Methods: We conducted a retrospective cohort study using a real-world data set that consisted of 16,901 unique patients associated with 26,073 inpatient emergency encounters and 951,591 observation sets collected between April 2015 and August 2021 at a large multispecialty hospital in Abu Dhabi, United Arab Emirates. The observation sets included routine measurements of heart rate, respiratory rate, systolic blood pressure, level of consciousness, temperature, and oxygen saturation, as well as whether the patient was receiving supplementary oxygen. We divided the data set of 16,901 unique patients into training, validation, and test sets consisting of 11,830 (70\%; 18,319/26,073, 70.26\% emergency encounters), 3397 (20.1\%; 5206/26,073, 19.97\% emergency encounters), and 1674 (9.9\%; 2548/26,073, 9.77\% emergency encounters) patients, respectively. We defined an adverse event as the occurrence of admission to the intensive care unit, mortality, or both if the patient was admitted to the intensive care unit first. 
On the basis of 7 routine vital signs measurements, we assessed the performance of the NEWS system in detecting deterioration within 24 hours using the area under the receiver operating characteristic curve (AUROC). We also developed and evaluated several machine learning models, including logistic regression, a gradient-boosting model, and a feed-forward neural network. Results: In a holdout test set of 2548 encounters with 95,755 observation sets, the NEWS system achieved an overall AUROC value of 0.682 (95\% CI 0.673-0.690). In comparison, the best-performing machine learning models, which were the gradient-boosting model and the neural network, achieved AUROC values of 0.778 (95\% CI 0.770-0.785) and 0.756 (95\% CI 0.749-0.764), respectively. Our interpretability results highlight the importance of temperature and respiratory rate in predicting patient deterioration. Conclusions: Although traditional early warning score systems are the dominant form of deterioration prediction models in clinical practice today, we strongly recommend the development and use of cohort-specific machine learning models as an alternative. This is especially important in external patient cohorts that were unseen during model development. ", doi="10.2196/45257", url="https://ai.jmir.org/2023/1/e45257", url="http://www.ncbi.nlm.nih.gov/pubmed/38875543" } @Article{info:doi/10.2196/47353, author="Hummelsberger, Pia and Koch, K. Timo and Rauh, Sabrina and Dorn, Julia and Lermer, Eva and Raue, Martina and Hudecek, C. Matthias F. and Schicho, Andreas and Colak, Errol and Ghassemi, Marzyeh and Gaube, Susanne", title="Insights on the Current State and Future Outlook of AI in Health Care: Expert Interview Study", journal="JMIR AI", year="2023", month="Oct", day="31", volume="2", pages="e47353", keywords="artificial intelligence", keywords="AI", keywords="machine learning", keywords="health care", keywords="digital health technology", keywords="technology implementation", keywords="expert interviews", keywords="mixed methods", keywords="topic modeling", abstract="Background: Artificial intelligence (AI) is often promoted as a potential solution for many challenges health care systems face worldwide. However, its implementation in clinical practice lags behind its technological development. Objective: This study aims to gain insights into the current state and prospects of AI technology from the stakeholders most directly involved in its adoption in the health care sector whose perspectives have received limited attention in research to date. Methods: For this purpose, the perspectives of AI researchers and health care IT professionals in North America and Western Europe were collected and compared for profession-specific and regional differences. In this preregistered, mixed methods, cross-sectional study, 23 experts were interviewed using a semistructured guide. Data from the interviews were analyzed using deductive and inductive qualitative methods for the thematic analysis along with topic modeling to identify latent topics. Results: Through our thematic analysis, four major categories emerged: (1) the current state of AI systems in health care, (2) the criteria and requirements for implementing AI systems in health care, (3) the challenges in implementing AI systems in health care, and (4) the prospects of the technology. Experts discussed the capabilities and limitations of current AI systems in health care in addition to their prevalence and regional differences. 
Several criteria and requirements deemed necessary for the successful implementation of AI systems were identified, including the technology's performance and security, smooth system integration and human-AI interaction, costs, stakeholder involvement, and employee training. However, regulatory, logistical, and technical issues were identified as the most critical barriers to an effective technology implementation process. In the future, our experts predicted both various threats and many opportunities related to AI technology in the health care sector. Conclusions: Our work provides new insights into the current state, criteria, challenges, and outlook for implementing AI technology in health care from the perspective of AI researchers and IT professionals in North America and Western Europe. For the full potential of AI-enabled technologies to be exploited and for them to contribute to solving current health care challenges, critical implementation criteria must be met, and all groups involved in the process must work together. ", doi="10.2196/47353", url="https://ai.jmir.org/2023/1/e47353", url="http://www.ncbi.nlm.nih.gov/pubmed/38875571" } @Article{info:doi/10.2196/48340, author="Shi, Bohan and Dhaliwal, Singh Satvinder and Soo, Marcus and Chan, Cheri and Wong, Jocelin and Lam, C. Natalie W. and Zhou, Entong and Paitimusa, Vivien and Loke, Yin Kum and Chin, Joel and Chua, Tuan Mei and Liaw, Suan Kathy Chiew and Lim, H. Amos W. and Insyirah, Fatin Fadil and Yen, Shih-Cheng and Tay, Arthur and Ang, Bin Seng", title="Assessing Elevated Blood Glucose Levels Through Blood Glucose Evaluation and Monitoring Using Machine Learning and Wearable Photoplethysmography Sensors: Algorithm Development and Validation", journal="JMIR AI", year="2023", month="Oct", day="27", volume="2", pages="e48340", keywords="diabetes mellitus", keywords="explainable artificial intelligence", keywords="feature engineering", keywords="machine learning", keywords="photoplethysmography", keywords="wearable sensor", abstract="Background: Diabetes mellitus is the most challenging and fastest-growing global public health concern. Approximately 10.5\% of the global adult population is affected by diabetes, and almost half of them are undiagnosed. The growing at-risk population exacerbates the shortage of health resources, with an estimated 10.6\% and 6.2\% of adults worldwide having impaired glucose tolerance and impaired fasting glycemia, respectively. All current diabetes screening methods are invasive and opportunistic and must be conducted in a hospital or laboratory by trained professionals. At-risk participants might remain undetected for years and miss the precious time window for early intervention to prevent or delay the onset of diabetes and its complications. Objective: We aimed to develop an artificial intelligence solution to recognize elevated blood glucose levels (≥7.8 mmol/L) noninvasively and evaluate diabetic risk based on repeated measurements. Methods: This study was conducted at KK Women's and Children's Hospital in Singapore, and 500 participants were recruited (mean age 38.73, SD 10.61 years; mean BMI 24.4, SD 5.1 kg/m2). The blood glucose levels for most participants were measured before and after consuming 75 g of sugary drinks using both a conventional glucometer (Accu-Chek Performa) and a wrist-worn wearable. The results obtained from the glucometer were used as ground-truth measurements. 
We performed extensive feature engineering on photoplethysmography (PPG) sensor data and identified features that were sensitive to glucose changes. These selected features were further analyzed using an explainable artificial intelligence approach to understand their contribution to our predictions. Results: Multiple machine learning models were trained and assessed with 10-fold cross-validation, using participant demographic data and critical features extracted from PPG measurements as predictors. A support vector machine with a radial basis function kernel had the best detection performance, with an average accuracy of 84.7\%, a sensitivity of 81.05\%, a specificity of 88.3\%, a precision of 87.51\%, a geometric mean of 84.54\%, and F score of 84.03\%. Conclusions: Our findings suggest that PPG measurements can be used to identify participants with elevated blood glucose measurements and assist in the screening of participants for diabetes risk. ", doi="10.2196/48340", url="https://ai.jmir.org/2023/1/e48340", url="http://www.ncbi.nlm.nih.gov/pubmed/38875549" } @Article{info:doi/10.2196/47223, author="Malgaroli, Matteo and Tseng, Emily and Hull, D. Thomas and Jennings, Emma and Choudhury, K. Tanzeem and Simon, M. Naomi", title="Association of Health Care Work With Anxiety and Depression During the COVID-19 Pandemic: Structural Topic Modeling Study", journal="JMIR AI", year="2023", month="Oct", day="24", volume="2", pages="e47223", keywords="depression", keywords="anxiety", keywords="health care workers", keywords="COVID-19", keywords="natural language processing", keywords="topic modeling", keywords="stressor", keywords="mental health", keywords="treatment", keywords="psychotherapy", keywords="digital health", abstract="Background: Stressors for health care workers (HCWs) during the COVID-19 pandemic have been manifold, with high levels of depression and anxiety alongside gaps in care. Identifying the factors most tied to HCWs' psychological challenges is crucial to addressing HCWs' mental health needs effectively, now and for future large-scale events. Objective: In this study, we used natural language processing methods to examine deidentified psychotherapy transcripts from telemedicine treatment during the initial wave of COVID-19 in the United States. Psychotherapy was delivered by licensed therapists while HCWs were managing increased clinical demands and elevated hospitalization rates, in addition to population-level social distancing measures and infection risks. Our goal was to identify specific concerns emerging in treatment for HCWs and to compare differences with matched non-HCW patients from the general population. Methods: We conducted a case-control study with a sample of 820 HCWs and 820 non-HCW matched controls who received digitally delivered psychotherapy in 49 US states in the spring of 2020 during the first US wave of the COVID-19 pandemic. Depression was measured during the initial assessment using the Patient Health Questionnaire-9, and anxiety was measured using the General Anxiety Disorder-7 questionnaire. Structural topic models (STMs) were used to determine treatment topics from deidentified transcripts from the first 3 weeks of treatment. STM effect estimators were also used to examine topic prevalence in patients with moderate to severe anxiety and depression. Results: The median treatment enrollment date was April 15, 2020 (IQR March 31 to April 27, 2020) for HCWs and April 19, 2020 (IQR April 5 to April 27, 2020) for matched controls. 
STM analysis of deidentified transcripts identified 4 treatment topics centered on health care and 5 on mental health for HCWs. For controls, 3 STM topics on pandemic-related disruptions and 5 on mental health were identified. Several STM treatment topics were significantly associated with moderate to severe anxiety and depression, including working on the hospital unit (topic prevalence 0.035, 95\% CI 0.022-0.048; P<.001), mood disturbances (prevalence 0.014, 95\% CI 0.002-0.026; P=.03), and sleep disturbances (prevalence 0.016, 95\% CI 0.002-0.030; P=.02). No significant associations emerged between pandemic-related topics and moderate to severe anxiety and depression for non-HCW controls. Conclusions: The study provides large-scale quantitative evidence that during the initial wave of the COVID-19 pandemic, HCWs faced unique work-related challenges and stressors associated with anxiety and depression, which required dedicated treatment efforts. The study further demonstrates how natural language processing methods have the potential to surface clinically relevant markers of distress while preserving patient privacy. ", doi="10.2196/47223", url="https://ai.jmir.org/2023/1/e47223", url="http://www.ncbi.nlm.nih.gov/pubmed/38875560" } @Article{info:doi/10.2196/48628, author="Kia, Arash and Waterson, James and Bargary, Norma and Rolt, Stuart and Burke, Kevin and Robertson, Jeremy and Garcia, Samuel and Benavoli, Alessio and Bergstr{\"o}m, David", title="Determinants of Intravenous Infusion Longevity and Infusion Failure via a Nonlinear Model Analysis of Smart Pump Event Logs: Retrospective Study", journal="JMIR AI", year="2023", month="Sep", day="13", volume="2", pages="e48628", keywords="intravenous infusion", keywords="vascular access device", keywords="alarm fatigue", keywords="intensive care units", keywords="intensive care", keywords="neonatal", keywords="predictive model", keywords="smart pump", keywords="smart device", keywords="health device", keywords="infusion", keywords="intravenous", keywords="nonlinear model", keywords="medical device", keywords="therapy", keywords="prediction model", keywords="artificial intelligence", keywords="AI", keywords="machine learning", keywords="predict", keywords="predictive", keywords="prediction", keywords="log data", keywords="event log", abstract="Background: Infusion failure may have severe consequences for patients receiving critical, short--half-life infusions. Continued interruptions to infusions can lead to subtherapeutic therapy. Objective: This study aims to identify and rank determinants of the longevity of continuous infusions administered through syringe drivers, using nonlinear predictive models. Additionally, this study aims to evaluate key factors influencing infusion longevity and develop and test a model for predicting the likelihood of achieving successful infusion longevity. Methods: Data were extracted from the event logs of smart pumps containing information on care profiles, medication types and concentrations, occlusion alarm settings, and the final infusion cessation cause. These data were then used to fit 5 nonlinear models and evaluate the best explanatory model. Results: Random forest was the best-fit predictor, with an F1-score of 80.42, compared to 5 other models (mean F1-score 75.06; range 67.48-79.63). 
When applied to infusion data in an individual syringe driver data set, the predictor model found that the final medication concentration and medication type were of less significance to infusion longevity compared to the rate and care unit. For low-rate infusions, rates ranging from 2 to 2.8 mL/hr performed best for achieving a balance between infusion longevity and fluid load per infusion, with an occlusion versus no-occlusion ratio of 0.553. Rates between 0.8 and 1.2 mL/hr exhibited the poorest performance with a ratio of 1.604. Higher rates, up to 4 mL/hr, performed better in terms of occlusion versus no-occlusion ratios. Conclusions: This study provides clinicians with insights into the specific types of infusion that warrant more intense observation or proactive management of intravenous access; additionally, it can offer valuable information regarding the average duration of uninterrupted infusions that can be expected in these care areas. Optimizing rate settings to improve infusion longevity for continuous infusions, achieved through compounding to create customized concentrations for individual patients, may be possible in light of the study's outcomes. The study also highlights the potential of machine learning nonlinear models in predicting outcomes and life spans of specific therapies delivered via medical devices. ", doi="10.2196/48628", url="https://ai.jmir.org/2023/1/e48628" } @Article{info:doi/10.2196/48123, author="Zouzos, Athanasios and Milovanovic, Aleksandra and Dembrower, Karin and Strand, Fredrik", title="Effect of Benign Biopsy Findings on an Artificial Intelligence--Based Cancer Detector in Screening Mammography: Retrospective Case-Control Study", journal="JMIR AI", year="2023", month="Aug", day="31", volume="2", pages="e48123", keywords="artificial intelligence", keywords="AI", keywords="mammography", keywords="breast cancer", keywords="benign biopsy", keywords="screening", keywords="cancer screening", keywords="diagnostic", keywords="radiology", keywords="detection system", abstract="Background: Artificial intelligence (AI)--based cancer detectors (CAD) for mammography are starting to be used for breast cancer screening in radiology departments. It is important to understand how AI CAD systems react to benign lesions, especially those that have been subjected to biopsy. Objective: Our goal was to corroborate the hypothesis that women with previous benign biopsy and cytology assessments would subsequently present increased AI CAD abnormality scores even though they remained healthy. Methods: This is a retrospective study applying a commercial AI CAD system (Insight MMG, version 1.1.4.3; Lunit Inc) to a cancer-enriched mammography screening data set of 10,889 women (median age 56, range 40-74 years). The AI CAD generated a continuous prediction score for tumor suspicion between 0.00 and 1.00, where 1.00 represented the highest level of suspicion. A binary read (flagged or not flagged) was defined on the basis of a predetermined cutoff threshold (0.40). The flagged median and proportion of AI scores were calculated for women who were healthy, those who had a benign biopsy finding, and those who were diagnosed with breast cancer. For women with a benign biopsy finding, the interval between mammography and the biopsy was used for stratification of AI scores. The effect of increasing age was examined using subgroup analysis and regression modeling. Results: Of a total of 10,889 women, 234 had a benign biopsy finding before or after screening. 
The proportions of flagged women were 3.5\%, 11\%, and 84\% for healthy women without a benign biopsy finding, those with a benign biopsy finding, and women with breast cancer, respectively (P<.001). For the 8307 women with complete information, radiologist 1, radiologist 2, and the AI CAD system flagged 8.5\%, 6.8\%, and 8.5\% of examinations of women who had a prior benign biopsy finding. The AI score correlated only with increasing age of the women in the cancer group (P=.01). Conclusions: Compared to healthy women without a biopsy, the examined AI CAD system flagged a much larger proportion of women who had or would have a benign biopsy finding based on a radiologist's decision. However, the flagging rate was not higher than that for radiologists. Further research should be focused on training the AI CAD system taking prior biopsy information into account. ", doi="10.2196/48123", url="https://ai.jmir.org/2023/1/e48123", url="http://www.ncbi.nlm.nih.gov/pubmed/38875554" } @Article{info:doi/10.2196/42313, author="Casey, Edward Aaron and Ansari, Saba and Nakisa, Bahareh and Kelly, Blair and Brown, Pieta and Cooper, Paul and Muhammad, Imran and Livingstone, Steven and Reddy, Sandeep and Makinen, Ville-Petteri", title="Application of a Comprehensive Evaluation Framework to COVID-19 Studies: Systematic Review of Translational Aspects of Artificial Intelligence in Health Care", journal="JMIR AI", year="2023", month="Jul", day="6", volume="2", pages="e42313", keywords="artificial intelligence", keywords="health care", keywords="clinical translation", keywords="translational value", keywords="evaluation", keywords="capability", keywords="utility", keywords="adoption", keywords="COVID-19", keywords="AI application", keywords="health care AI", keywords="model validation", keywords="AI model", keywords="AI tools", abstract="Background: Despite immense progress in artificial intelligence (AI) models, there has been limited deployment in health care environments. The gap between potential and actual AI applications is likely due to the lack of translatability between controlled research environments (where these models are developed) and clinical environments for which the AI tools are ultimately intended. Objective: We previously developed the Translational Evaluation of Healthcare AI (TEHAI) framework to assess the translational value of AI models and to support successful transition to health care environments. In this study, we applied the TEHAI framework to the COVID-19 literature in order to assess how well translational topics are covered. Methods: A systematic literature search for COVID-19 AI studies published between December 2019 and December 2020 resulted in 3830 records. A subset of 102 (2.7\%) papers that passed the inclusion criteria was sampled for full review. The papers were assessed for translational value and descriptive data collected by 9 reviewers (each study was assessed by 2 reviewers). Evaluation scores and extracted data were compared by a third reviewer for resolution of discrepancies. The review process was conducted on the Covidence software platform. Results: We observed a significant trend for studies to attain high scores for technical capability but low scores for the areas essential for clinical translatability. Specific questions regarding external model validation, safety, nonmaleficence, and service adoption received failed scores in most studies. 
Conclusions: Using TEHAI, we identified notable gaps in how well translational topics of AI models are covered in the COVID-19 clinical sphere. These gaps in areas crucial for clinical translatability could, and should, be considered already at the model development stage to increase translatability into real COVID-19 health care environments. ", doi="10.2196/42313", url="https://ai.jmir.org/2023/1/e42313", url="http://www.ncbi.nlm.nih.gov/pubmed/37457747" } @Article{info:doi/10.2196/46487, author="Robinson, Renee and Liday, Cara and Lee, Sarah and Williams, C. Ishan and Wright, Melanie and An, Sungjoon and Nguyen, Elaine", title="Artificial Intelligence in Health Care---Understanding Patient Information Needs and Designing Comprehensible Transparency: Qualitative Study", journal="JMIR AI", year="2023", month="Jun", day="19", volume="2", pages="e46487", keywords="artificial intelligence", keywords="machine learning", keywords="diabetes", keywords="equipment safety", keywords="equipment design", keywords="health care", abstract="Background: Artificial intelligence (AI) is a branch of computer science that uses advanced computational methods, such as machine learning (ML), to calculate and predict health outcomes and address patient and provider health needs. While these technologies show great promise for improving health care, especially in diabetes management, there are usability and safety concerns for both patients and providers about the use of AI/ML in health care management. Objective: We aimed to support and ensure safe use of AI/ML technologies in health care; thus, the team worked to better understand (1) patient information and training needs, (2) the factors that influence patients' perceived value and trust in AI/ML health care applications, and (3) how best to support safe and appropriate use of AI/ML-enabled devices and applications among people living with diabetes. Methods: To understand general patient perspectives and information needs related to the use of AI/ML in health care, we conducted a series of focus groups (n=9) and interviews (n=3) with patients (n=41) and interviews with providers (n=6) in Alaska, Idaho, and Virginia. Grounded theory guided data gathering, synthesis, and analysis. Thematic content and constant comparison analysis were used to identify relevant themes and subthemes. Inductive approaches were used to link data to key concepts, including preferred patient-provider interactions and patient perceptions of trust, accuracy, value, assurances, and information transparency. Results: Key summary themes and recommendations focused on (1) patient preferences for AI/ML-enabled device and application information, (2) patient and provider AI/ML-related device and application training needs, (3) factors contributing to patient and provider trust in AI/ML-enabled devices and applications, and (4) AI/ML-related device and application functionality and safety considerations. A number of participants (patients and providers) made recommendations to improve device functionality to guide information and labeling mandates (eg, link to online video resources and provide access to 24/7 live in-person or virtual emergency support). Other patient recommendations included (1) providing access to practice devices, (2) providing connections to local supports and reputable community resources, and (3) simplifying the display and alert limits. 
Conclusions: Recommendations from both patients and providers could be used by federal oversight agencies to improve utilization of AI/ML monitoring of technology use in diabetes, improving device safety and efficacy. ", doi="10.2196/46487", url="https://ai.jmir.org/2023/1/e46487", url="http://www.ncbi.nlm.nih.gov/pubmed/38333424" } @Article{info:doi/10.2196/44191, author="Pongdee, Thanai and Larson, B. Nicholas and Divekar, Rohit and Bielinski, J. Suzette and Liu, Hongfang and Moon, Sungrim", title="Automated Identification of Aspirin-Exacerbated Respiratory Disease Using Natural Language Processing and Machine Learning: Algorithm Development and Evaluation Study", journal="JMIR AI", year="2023", month="Jun", day="12", volume="2", pages="e44191", keywords="aspirin exacerbated respiratory disease", keywords="natural language processing", keywords="electronic health record", keywords="identification", keywords="machine learning", keywords="aspirin", keywords="asthma", keywords="respiratory illness", keywords="artificial intelligence", keywords="natural language processing algorithm", abstract="Background: Aspirin-exacerbated respiratory disease (AERD) is an acquired inflammatory condition characterized by the presence of asthma, chronic rhinosinusitis with nasal polyposis, and respiratory hypersensitivity reactions on ingestion of aspirin or other nonsteroidal anti-inflammatory drugs (NSAIDs). Despite AERD having a classic constellation of symptoms, the diagnosis is often overlooked, with an average of greater than 10 years between the onset of symptoms and diagnosis of AERD. Without a diagnosis, individuals will lack opportunities to receive effective treatments, such as aspirin desensitization or biologic medications. Objective: Our aim was to develop a combined algorithm that integrates both natural language processing (NLP) and machine learning (ML) techniques to identify patients with AERD from an electronic health record (EHR). Methods: A rule-based decision tree algorithm incorporating NLP-based features was developed using clinical documents from the EHR at Mayo Clinic. From clinical notes, using NLP techniques, 7 features were extracted that included the following: AERD, asthma, NSAID allergy, nasal polyps, chronic sinusitis, elevated urine leukotriene E4 level, and documented no-NSAID allergy. MedTagger was used to extract these 7 features from the unstructured clinical text given a set of keywords and patterns based on the chart review of 2 allergy and immunology experts for AERD. The status of each extracted feature was quantified by assigning the frequency of its occurrence in clinical documents per subject. We optimized the decision tree classifier's hyperparameters cutoff threshold on the training set to determine the representative feature combination to discriminate AERD. We then evaluated the resulting model on the test set. Results: The AERD algorithm, which combines NLP and ML techniques, achieved an area under the receiver operating characteristic curve score, sensitivity, and specificity of 0.86 (95\% CI 0.78-0.94), 80.00 (95\% CI 70.82-87.33), and 88.00 (95\% CI 79.98-93.64) for the test set, respectively. Conclusions: We developed a promising AERD algorithm that needs further refinement to improve AERD diagnosis. Continued development of NLP and ML technologies has the potential to reduce diagnostic delays for AERD and improve the health of our patients. 
", doi="10.2196/44191", url="https://ai.jmir.org/2023/1/e44191" } @Article{info:doi/10.2196/44835, author="Dolatabadi, Elham and Chen, Branson and Buchan, A. Sarah and Austin, Marchand Alex and Azimaee, Mahmoud and McGeer, Allison and Mubareka, Samira and Kwong, C. Jeffrey", title="Natural Language Processing for Clinical Laboratory Data Repository Systems: Implementation and Evaluation for Respiratory Viruses", journal="JMIR AI", year="2023", month="Jun", day="6", volume="2", pages="e44835", keywords="health", keywords="informatics", keywords="natural language processing", keywords="knowledge extraction", keywords="electronic health record", keywords="EHR", abstract="Background: With the growing volume and complexity of laboratory repositories, it has become tedious to parse unstructured data into structured and tabulated formats for secondary uses such as decision support, quality assurance, and outcome analysis. However, advances in natural language processing (NLP) approaches have enabled efficient and automated extraction of clinically meaningful medical concepts from unstructured reports. Objective: In this study, we aimed to determine the feasibility of using the NLP model for information extraction as an alternative approach to a time-consuming and operationally resource-intensive handcrafted rule-based tool. Therefore, we sought to develop and evaluate a deep learning--based NLP model to derive knowledge and extract information from text-based laboratory reports sourced from a provincial laboratory repository system. Methods: The NLP model, a hierarchical multilabel classifier, was trained on a corpus of laboratory reports covering testing for 14 different respiratory viruses and viral subtypes. The corpus includes 87,500 unique laboratory reports annotated by 8 subject matter experts (SMEs). The classification task involved assigning the laboratory reports to labels at 2 levels: 24 fine-grained labels in level 1 and 6 coarse-grained labels in level 2. A ``label'' also refers to the status of a specific virus or strain being tested or detected (eg, influenza A is detected). The model's performance stability and variation were analyzed across all labels in the classification task. Additionally, the model's generalizability was evaluated internally and externally on various test sets. Results: Overall, the NLP model performed well on internal, out-of-time (pre--COVID-19), and external (different laboratories) test sets with microaveraged F1-scores >94\% across all classes. Higher precision and recall scores with less variability were observed for the internal and pre--COVID-19 test sets. As expected, the model's performance varied across categories and virus types due to the imbalanced nature of the corpus and sample sizes per class. There were intrinsically fewer classes of viruses being detected than those tested; therefore, the model's performance (lowest F1-score of 57\%) was noticeably lower in the detected cases. Conclusions: We demonstrated that deep learning--based NLP models are promising solutions for information extraction from text-based laboratory reports. These approaches enable scalable, timely, and practical access to high-quality and encoded laboratory data if integrated into laboratory information system repositories. ", doi="10.2196/44835", url="https://ai.jmir.org/2023/1/e44835", url="http://www.ncbi.nlm.nih.gov/pubmed/38875570" } @Article{info:doi/10.2196/45032, author="Liaw, R. Winston and Ramos Silva, Yessenia and Soltero, G. 
Erica and Krist, Alex and Stotts, L. Angela", title="An Assessment of How Clinicians and Staff Members Use a Diabetes Artificial Intelligence Prediction Tool: Mixed Methods Study", journal="JMIR AI", year="2023", month="May", day="29", volume="2", pages="e45032", keywords="artificial intelligence", keywords="medical informatics", keywords="qualitative research", keywords="prediction tool", keywords="clinicians", keywords="diabetes", keywords="treatment", keywords="clinical decision support", keywords="decision-making", keywords="survey", keywords="interview", keywords="usefulness", keywords="implementation", keywords="validation", keywords="design", keywords="usability", abstract="Background: Nearly one-third of patients with diabetes are poorly controlled (hemoglobin A1c ≥9\%). Identifying at-risk individuals and providing them with effective treatment is an important strategy for preventing poor control. Objective: This study aims to assess how clinicians and staff members would use a clinical decision support tool based on artificial intelligence (AI) and identify factors that affect adoption. Methods: This was a mixed methods study that combined semistructured interviews and surveys to assess the perceived usefulness and ease of use, intent to use, and factors affecting tool adoption. We recruited clinicians and staff members from practices that manage diabetes. During the interviews, participants reviewed a sample electronic health record alert and were informed that the tool uses AI to identify those at high risk for poor control. Participants discussed how they would use the tool, whether it would contribute to care, and the factors affecting its implementation. In a survey, participants reported their demographics; rank-ordered factors influencing the adoption of the tool; and reported their perception of the tool's usefulness as well as their intent to use, ease of use, and organizational support for use. Qualitative data were analyzed using a thematic content analysis approach. We used descriptive statistics to report demographics and analyze the findings of the survey. Results: In total, 22 individuals participated in the study. Two-thirds (14/22, 63\%) of respondents were physicians. Overall, 36\% (8/22) of respondents worked in academic health centers, whereas 27\% (6/22) of respondents worked in federally qualified health centers. The interviews identified several themes: this tool has the potential to be useful because it provides information that is not currently available and can make care more efficient and effective; clinicians and staff members were concerned about how the tool affects patient-oriented outcomes and clinical workflows; adoption of the tool is dependent on its validation, transparency, actionability, and design and could be increased with changes to the interface and usability; and implementation would require buy-in and need to be tailored to the demands and resources of clinics and communities. Survey findings supported these themes, as 77\% (17/22) of participants somewhat, moderately, or strongly agreed that they would use the tool, whereas these figures were 82\% (18/22) for usefulness, 82\% (18/22) for ease of use, and 68\% (15/22) for clinic support. The 2 highest ranked factors affecting adoption were whether the tool improves health and the accuracy of the tool. Conclusions: Most participants found the tool to be easy to use and useful, although they had concerns about alert fatigue, bias, and transparency. 
These data will be used to enhance the design of an AI tool. ", doi="10.2196/45032", url="https://ai.jmir.org/2023/1/e45032", url="http://www.ncbi.nlm.nih.gov/pubmed/38875578" } @Article{info:doi/10.2196/45450, author="Chan, Berin Nicholas and Li, Weizi and Aung, Theingi and Bazuaye, Eghosa and Montero, M. Rosa", title="Machine Learning--Based Time in Patterns for Blood Glucose Fluctuation Pattern Recognition in Type 1 Diabetes Management: Development and Validation Study", journal="JMIR AI", year="2023", month="May", day="26", volume="2", pages="e45450", keywords="diabetes mellitus", keywords="continuous glucose monitoring", keywords="glycemic variability", keywords="glucose fluctuation pattern", keywords="temporal clustering", keywords="scalable metrics", abstract="Background: Continuous glucose monitoring (CGM) for diabetes combines noninvasive glucose biosensors, continuous monitoring, cloud computing, and analytics to connect and simulate a hospital setting in a person's home. CGM systems inspired analytics methods to measure glycemic variability (GV), but existing GV analytics methods disregard glucose trends and patterns; hence, they fail to capture entire temporal patterns and do not provide granular insights about glucose fluctuations. Objective: This study aimed to propose a machine learning--based framework for blood glucose fluctuation pattern recognition, which enables a more comprehensive representation of GV profiles that could present detailed fluctuation information, be easily understood by clinicians, and provide insights about patient groups based on time in blood fluctuation patterns. Methods: Overall, 1.5 million measurements from 126 patients in the United Kingdom with type 1 diabetes mellitus (T1DM) were collected, and prevalent blood fluctuation patterns were extracted using dynamic time warping. The patterns were further validated in 225 patients in the United States with T1DM. Hierarchical clustering was then applied on time in patterns to form 4 clusters of patients. Patient groups were compared using statistical analysis. Results: In total, 6 patterns depicting distinctive glucose levels and trends were identified and validated, based on which 4 GV profiles of patients with T1DM were found. They were significantly different in terms of glycemic statuses such as diabetes duration (P=.04), glycated hemoglobin level (P<.001), and time in range (P<.001) and thus had different management needs. Conclusions: The proposed method can analytically extract existing blood fluctuation patterns from CGM data. Thus, time in patterns can capture a rich view of patients' GV profile. Its conceptual resemblance with time in range, along with rich blood fluctuation details, makes it more scalable, accessible, and informative to clinicians. 
", doi="10.2196/45450", url="https://ai.jmir.org/2023/1/e45450" } @Article{info:doi/10.2196/44779, author="Naseri, Hossein and Skamene, Sonia and Tolba, Marwan and Faye, Daro Mame and Ramia, Paul and Khriguian, Julia and David, Marc and Kildea, John", title="A Scalable Radiomics- and Natural Language Processing--Based Machine Learning Pipeline to Distinguish Between Painful and Painless Thoracic Spinal Bone Metastases: Retrospective Algorithm Development and Validation Study", journal="JMIR AI", year="2023", month="May", day="22", volume="2", pages="e44779", keywords="cancer", keywords="pain", keywords="palliative care", keywords="radiotherapy", keywords="bone metastases", keywords="radiomics", keywords="natural language processing", keywords="machine learning", keywords="artificial intelligent", keywords="radiation therapy", abstract="Background: The identification of objective pain biomarkers can contribute to an improved understanding of pain, as well as its prognosis and better management. Hence, it has the potential to improve the quality of life of patients with cancer. Artificial intelligence can aid in the extraction of objective pain biomarkers for patients with cancer with bone metastases (BMs). Objective: This study aimed to develop and evaluate a scalable natural language processing (NLP)-- and radiomics-based machine learning pipeline to differentiate between painless and painful BM lesions in simulation computed tomography (CT) images using imaging features (biomarkers) extracted from lesion center point--based regions of interest (ROIs). Methods: Patients treated at our comprehensive cancer center who received palliative radiotherapy for thoracic spine BM between January 2016 and September 2019 were included in this retrospective study. Physician-reported pain scores were extracted automatically from radiation oncology consultation notes using an NLP pipeline. BM center points were manually pinpointed on CT images by radiation oncologists. Nested ROIs with various diameters were automatically delineated around these expert-identified BM center points, and radiomics features were extracted from each ROI. Synthetic Minority Oversampling Technique resampling, the Least Absolute Shrinkage And Selection Operator feature selection method, and various machine learning classifiers were evaluated using precision, recall, F1-score, and area under the receiver operating characteristic curve. Results: Radiation therapy consultation notes and simulation CT images of 176 patients (mean age 66, SD 14 years; 95 males) with thoracic spine BM were included in this study. After BM center point identification, 107 radiomics features were extracted from each spherical ROI using pyradiomics. Data were divided into 70\% and 30\% training and hold-out test sets, respectively. In the test set, the accuracy, sensitivity, specificity, and area under the receiver operating characteristic curve of our best performing model (neural network classifier on an ensemble ROI) were 0.82 (132/163), 0.59 (16/27), 0.85 (116/136), and 0.83, respectively. Conclusions: Our NLP- and radiomics-based machine learning pipeline was successful in differentiating between painful and painless BM lesions. It is intrinsically scalable by using NLP to extract pain scores from clinical notes and by requiring only center points to identify BM lesions in CT images. 
", doi="10.2196/44779", url="https://ai.jmir.org/2023/1/e44779", url="http://www.ncbi.nlm.nih.gov/pubmed/38875572" } @Article{info:doi/10.2196/44432, author="Ogbechie, Matthew-David and Fischer Walker, Christa and Lee, Mu-Tien and Abba Gana, Amina and Oduola, Abimbola and Idemudia, Augustine and Edor, Matthew and Harris, Lark Emily and Stephens, Jessica and Gao, Xiaoming and Chen, Pai-Lien and Persaud, Etwaroo Navindra", title="Predicting Treatment Interruption Among People Living With HIV in Nigeria: Machine Learning Approach", journal="JMIR AI", year="2023", month="May", day="12", volume="2", pages="e44432", keywords="HIV", keywords="machine learning", keywords="treatment interruption", keywords="Nigeria", keywords="chronic disease", keywords="antiretroviral therapy", keywords="HIV program", keywords="intervention", keywords="data collection", abstract="Background: Antiretroviral therapy (ART) has transformed HIV from a fatal illness to a chronic disease. Given the high rate of treatment interruptions, HIV programs use a range of approaches to support individuals in adhering to ART and in re-engaging those who interrupt treatment. These interventions can often be time-consuming and costly, and thus providing for all may not be sustainable. Objective: This study aims to describe our experiences developing a machine learning (ML) model to predict interruption in treatment (IIT) at 30 days among people living with HIV newly enrolled on ART in Nigeria and our integration of the model into the routine information system. In addition, we collected health workers' perceptions and use of the model's outputs for case management. Methods: Routine program data collected from January 2005 through February 2021 was used to train and test an ML model (boosting tree and Extreme Gradient Boosting) to predict future IIT. Data were randomly sampled using an 80/20 split into training and test data sets, respectively. Model performance was estimated using sensitivity, specificity, and positive and negative predictive values. Variables considered to be highly associated with treatment interruption were preselected by a group of HIV prevention researchers, program experts, and biostatisticians for inclusion in the model. Individuals were defined as having IIT if they were provided a 30-day supply of antiretrovirals but did not return for a refill within 28 days of their scheduled follow-up visit date. Outputs from the ML model were shared weekly with health care workers at selected facilities. Results: After data cleaning, complete data for 136,747 clients were used for the analysis. The percentage of IIT cases decreased from 58.6\% (36,663/61,864) before 2017 to 14.2\% (3690/28,046) from October 2019 through February 2021. Overall IIT was higher among clients who were sicker at enrollment. Other factors that were significantly associated with IIT included pregnancy and breastfeeding status and facility characteristics (location, service level, and service type). Several models were initially developed; the selected model had a sensitivity of 81\%, specificity of 88\%, positive predictive value of 83\%, and negative predictive value of 87\%, and was successfully integrated into the national electronic medical records database. During field-testing, the majority of users reported that an IIT prediction tool could lead to proactive steps for preventing IIT and improving patient outcomes. 
Conclusions: High-performing ML models to identify patients with HIV at risk of IIT can be developed using routinely collected service delivery data and integrated into routine health management information systems. Machine learning can improve the targeting of interventions through differentiated models of care before patients interrupt treatment, resulting in increased cost-effectiveness and improved patient outcomes. ", doi="10.2196/44432", url="https://ai.jmir.org/2023/1/e44432", url="http://www.ncbi.nlm.nih.gov/pubmed/38875546" } @Article{info:doi/10.2196/41205, author="Owen, David and Antypas, Dimosthenis and Hassoulas, Athanasios and Pardi{\~n}as, F. Antonio and Espinosa-Anke, Luis and Collados, Camacho Jose", title="Enabling Early Health Care Intervention by Detecting Depression in Users of Web-Based Forums using Language Models: Longitudinal Analysis and Evaluation", journal="JMIR AI", year="2023", month="Mar", day="24", volume="2", pages="e41205", keywords="mental health", keywords="depression", keywords="internet", keywords="natural language processing", keywords="transformers", keywords="language models", keywords="sentiment", abstract="Background: Major depressive disorder is a common mental disorder affecting 5\% of adults worldwide. Early contact with health care services is critical for achieving accurate diagnosis and improving patient outcomes. Key symptoms of major depressive disorder (depression hereafter) such as cognitive distortions are observed in verbal communication, which can also manifest in the structure of written language. Thus, the automatic analysis of text outputs may provide opportunities for early intervention in settings where written communication is rich and regular, such as social media and web-based forums. Objective: The objective of this study was 2-fold. We sought to gauge the effectiveness of different machine learning approaches to identify users of the mass web-based forum Reddit, who eventually disclose a diagnosis of depression. We then aimed to determine whether the time between a forum post and a depression diagnosis date was a relevant factor in performing this detection. Methods: A total of 2 Reddit data sets containing posts belonging to users with and without a history of depression diagnosis were obtained. The intersection of these data sets provided users with an estimated date of depression diagnosis. This derived data set was used as an input for several machine learning classifiers, including transformer-based language models (LMs). Results: Bidirectional Encoder Representations from Transformers (BERT) and MentalBERT transformer-based LMs proved the most effective in distinguishing forum users with a known depression diagnosis from those without. They each obtained a mean F1-score of 0.64 across the experimental setups used for binary classification. The results also suggested that the final 12 to 16 weeks (about 3-4 months) of posts before a depressed user's estimated diagnosis date are the most indicative of their illness, with data before that period not helping the models detect more accurately. Furthermore, in the 4- to 8-week period before the user's estimated diagnosis date, their posts exhibited more negative sentiment than any other 4-week period in their post history. Conclusions: Transformer-based LMs may be used on data from web-based social media forums to identify users at risk for psychiatric conditions such as depression. 
Language features picked up by these classifiers might predate depression onset by weeks to months, enabling proactive mental health care interventions to support those at risk for this condition. ", doi="10.2196/41205", url="https://ai.jmir.org/2023/1/e41205", url="http://www.ncbi.nlm.nih.gov/pubmed/37525646" } @Article{info:doi/10.2196/40973, author="Jeyakumar, Tharshini and Younus, Sarah and Zhang, Melody and Clare, Megan and Charow, Rebecca and Karsan, Inaara and Dhalla, Azra and Al-Mouaswas, Dalia and Scandiffio, Jillian and Aling, Justin and Salhia, Mohammad and Lalani, Nadim and Overholt, Scott and Wiljer, David", title="Preparing for an Artificial Intelligence--Enabled Future: Patient Perspectives on Engagement and Health Care Professional Training for Adopting Artificial Intelligence Technologies in Health Care Settings", journal="JMIR AI", year="2023", month="Mar", day="2", volume="2", pages="e40973", keywords="artificial intelligence", keywords="patient", keywords="education", keywords="attitude", keywords="health data", keywords="adoption", keywords="health equity", keywords="patient engagement", abstract="Background: As new technologies emerge, there is a significant shift in the way care is delivered on a global scale. Artificial intelligence (AI) technologies have been rapidly and inexorably used to optimize patient outcomes, reduce health system costs, improve workflow efficiency, and enhance population health. Despite the widespread adoption of AI technologies, the literature on patient engagement and their perspectives on how AI will affect clinical care is scarce. Minimal patient engagement can limit the optimization of these novel technologies and contribute to suboptimal use in care settings. Objective: We aimed to explore patients' views on what skills they believe health care professionals should have in preparation for this AI-enabled future and how we can better engage patients when adopting and deploying AI technologies in health care settings. Methods: Semistructured interviews were conducted from August 2020 to December 2021 with 12 individuals who were a patient in any Canadian health care setting. Interviews were conducted until thematic saturation occurred. A thematic analysis approach outlined by Braun and Clarke was used to inductively analyze the data and identify overarching themes. Results: Among the 12 patients interviewed, 8 (67\%) were from urban settings and 4 (33\%) were from rural settings. A majority of the participants were very comfortable with technology (n=6, 50\%) and somewhat familiar with AI (n=7, 58\%). In total, 3 themes emerged: cultivating patients' trust, fostering patient engagement, and establishing data governance and validation of AI technologies. Conclusions: With the rapid surge of AI solutions, there is a critical need to understand patient values in advancing the quality of care and contributing to an equitable health system. Our study demonstrated that health care professionals play a synergetic role in the future of AI and digital technologies. Patient engagement is vital in addressing underlying health inequities and fostering an optimal care experience. Future research is warranted to understand and capture the diverse perspectives of patients with various racial, ethnic, and socioeconomic backgrounds. 
", doi="10.2196/40973", url="https://ai.jmir.org/2023/1/e40973", url="http://www.ncbi.nlm.nih.gov/pubmed/38875561" } @Article{info:doi/10.2196/42936, author="Berdahl, Thomas Carl and Baker, Lawrence and Mann, Sean and Osoba, Osonde and Girosi, Federico", title="Strategies to Improve the Impact of Artificial Intelligence on Health Equity: Scoping Review", journal="JMIR AI", year="2023", month="Feb", day="7", volume="2", pages="e42936", keywords="artificial intelligence", keywords="machine learning", keywords="health equity", keywords="health care disparities", keywords="algorithmic bias", keywords="social determinants of health", keywords="decision making", keywords="algorithms", keywords="gray literature", keywords="equity", keywords="health data", abstract="Background: Emerging artificial intelligence (AI) applications have the potential to improve health, but they may also perpetuate or exacerbate inequities. Objective: This review aims to provide a comprehensive overview of the health equity issues related to the use of AI applications and identify strategies proposed to address them. Methods: We searched PubMed, Web of Science, the IEEE (Institute of Electrical and Electronics Engineers) Xplore Digital Library, ProQuest U.S. Newsstream, Academic Search Complete, the Food and Drug Administration (FDA) website, and ClinicalTrials.gov to identify academic and gray literature related to AI and health equity that were published between 2014 and 2021 and additional literature related to AI and health equity during the COVID-19 pandemic from 2020 and 2021. Literature was eligible for inclusion in our review if it identified at least one equity issue and a corresponding strategy to address it. To organize and synthesize equity issues, we adopted a 4-step AI application framework: Background Context, Data Characteristics, Model Design, and Deployment. We then created a many-to-many mapping of the links between issues and strategies. Results: In 660 documents, we identified 18 equity issues and 15 strategies to address them. Equity issues related to Data Characteristics and Model Design were the most common. The most common strategies recommended to improve equity were improving the quantity and quality of data, evaluating the disparities introduced by an application, increasing model reporting and transparency, involving the broader community in AI application development, and improving governance. Conclusions: Stakeholders should review our many-to-many mapping of equity issues and strategies when planning, developing, and implementing AI applications in health care so that they can make appropriate plans to ensure equity for populations affected by their products. AI application developers should consider adopting equity-focused checklists, and regulators such as the FDA should consider requiring them. Given that our review was limited to documents published online, developers may have unpublished knowledge of additional issues and strategies that we were unable to identify. 
", doi="10.2196/42936", url="https://ai.jmir.org/2023/1/e42936" } @Article{info:doi/10.2196/38397, author="Wu, Hao and Lu, Xiaoyu and Wang, Hanyu", title="The Application of Artificial Intelligence in Health Care Resource Allocation Before and During the COVID-19 Pandemic: Scoping Review", journal="JMIR AI", year="2023", month="Jan", day="30", volume="2", pages="e38397", keywords="artificial intelligence", keywords="resource distribution", keywords="health care", keywords="COVID-19", keywords="health equality", keywords="eHealth", keywords="digital health", abstract="Background: Imbalanced health care resource distribution has been central to unequal health outcomes and political tension around the world. Artificial intelligence (AI) has emerged as a promising tool for facilitating resource distribution, especially during emergencies. However, no comprehensive review exists on the use and ethics of AI in health care resource distribution. Objective: This study aims to conduct a scoping review of the application of AI in health care resource distribution, and explore the ethical and political issues in such situations. Methods: A scoping review was conducted following the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews). A comprehensive search of relevant literature was conducted in MEDLINE (Ovid), PubMed, Web of Science, and Embase from inception to February 2022. The review included qualitative and quantitative studies investigating the application of AI in health care resource allocation. Results: The review involved 22 articles, including 9 on model development and 13 on theoretical discussions, qualitative studies, or review studies. Of the 9 on model development and validation, 5 were conducted in emerging economies, 3 in developed countries, and 1 in a global context. In terms of content, 4 focused on resource distribution at the health system level and 5 focused on resource allocation at the hospital level. Of the 13 qualitative studies, 8 were discussions on the COVID-19 pandemic and the rest were on hospital resources, outbreaks, screening, human resources, and digitalization. Conclusions: This scoping review synthesized evidence on AI in health resource distribution, focusing on the COVID-19 pandemic. The results suggest that the application of AI has the potential to improve efficacy in resource distribution, especially during emergencies. Efficient data sharing and collecting structures are needed to make reliable and evidence-based decisions. Health inequality, distributive justice, and transparency must be considered when deploying AI models in real-world situations. 
", doi="10.2196/38397", url="https://ai.jmir.org/2023/1/e38397", url="http://www.ncbi.nlm.nih.gov/pubmed/27917920" } @Article{info:doi/10.2196/37751, author="Liu, Xiaoyu and Alsghaier, Hiba and Tong, Ling and Ataullah, Amna and McRoy, Susan", title="Visualizing the Interpretation of a Criteria-Driven System That Automatically Evaluates the Quality of Health News: Exploratory Study of 2 Approaches", journal="JMIR AI", year="2022", month="Dec", day="20", volume="1", number="1", pages="e37751", keywords="health misinformation", keywords="machine learning", keywords="local interpretable model-agnostic explanation", keywords="LIME", keywords="interpretable artificial intelligence", keywords="AI", abstract="Background: Machine learning techniques have been shown to be efficient in identifying health misinformation, but the results may not be trusted unless they can be justified in a way that is understandable. Objective: This study aimed to provide a new criteria-based system to assess and justify health news quality. Using a subset of an existing set of criteria, this study compared the feasibility of 2 alternative methods for adding interpretability. Both methods used classification and highlighting to visualize sentence-level evidence. Methods: A total of 3 out of 10 well-established criteria were chosen for experimentation, namely whether the health news discussed the costs of the intervention (the cost criterion), explained or quantified the harms of the intervention (the harm criterion), and identified the conflicts of interest (the conflict criterion). The first step of the experiment was to automate the evaluation of the 3 criteria by developing a sentence-level classifier. We tested Logistic Regression, Naive Bayes, Support Vector Machine, and Random Forest algorithms. Next, we compared the 2 visualization approaches. For the first approach, we calculated word feature weights, which explained how classification models distill keywords that contribute to the prediction; then, using the local interpretable model-agnostic explanation framework, we selected keywords associated with the classified criterion at the document level; and finally, the system selected and highlighted sentences with keywords. For the second approach, we extracted sentences that provided evidence to support the evaluation result from 100 health news articles; based on these results, we trained a typology classification model at the sentence level; and then, the system highlighted a positive sentence instance for the result justification. The number of sentences to highlight was determined by a preset threshold empirically determined using the average accuracy. Results: The automatic evaluation of health news on the cost, harm, and conflict criteria achieved average area under the curve scores of 0.88, 0.76, and 0.73, respectively, after 50 repetitions of 10-fold cross-validation. We found that both approaches could successfully visualize the interpretation of the system but that the performance of the 2 approaches varied by criterion and highlighting the accuracy decreased as the number of highlighted sentences increased. When the threshold accuracy was ?75\%, this resulted in a visualization with a variable length ranging from 1 to 6 sentences. Conclusions: We provided 2 approaches to interpret criteria-based health news evaluation models tested on 3 criteria. This method incorporated rule-based and statistical machine learning approaches. 
The results suggested that one might visually interpret an automatic criterion-based health news quality evaluation successfully using either approach; however, larger differences may arise when multiple quality-related criteria are considered. This study can increase public trust in computerized health information evaluation. ", doi="10.2196/37751", url="https://ai.jmir.org/2022/1/e37751", url="http://www.ncbi.nlm.nih.gov/pubmed/38875559" } @Article{info:doi/10.2196/38171, author="Vertsberger, Dana and Naor, Navot and Winsberg, Mir{\`e}ne", title="Adolescents' Well-being While Using a Mobile Artificial Intelligence--Powered Acceptance Commitment Therapy Tool: Evidence From a Longitudinal Study", journal="JMIR AI", year="2022", month="Nov", day="29", volume="1", number="1", pages="e38171", keywords="well-being", keywords="adolescents", keywords="chatbots", keywords="conversational agents", keywords="mental health", keywords="mobile mental health", keywords="automated", keywords="support", keywords="self-management", keywords="self-help", keywords="smartphone", keywords="psychology", keywords="intervention", keywords="psychological", keywords="therapy", keywords="acceptance", keywords="commitment", keywords="engagement", abstract="Background: Adolescence is a critical developmental period to prevent and treat the emergence of mental health problems. Smartphone-based conversational agents can deliver psychologically driven intervention and support, thus increasing psychological well-being over time. Objective: The objective of the study was to test the potential of an automated conversational agent named Kai.ai to deliver a self-help program based on Acceptance Commitment Therapy tools for adolescents, aimed to increase their well-being. Methods: Participants were 10,387 adolescents, aged 14-18 years, who used Kai.ai on one of the top messaging apps (eg, iMessage and WhatsApp). Users' well-being levels were assessed between 2 and 5 times using the 5-item World Health Organization Well-being Index questionnaire over their engagement with the service. Results: Users engaged with the conversational agent an average of 45.39 (SD 46.77) days. The average well-being score at time point 1 was 39.28 (SD 18.17), indicating that, on average, users experienced reduced well-being. Latent growth curve modeling indicated that participants' well-being significantly increased over time ($\beta$=2.49; P<.001) and reached a clinically acceptable well-being average score (above 50). Conclusions: Mobile-based conversational agents have the potential to deliver engaging and effective Acceptance Commitment Therapy interventions. 
", doi="10.2196/38171", url="https://ai.jmir.org/2022/1/e38171" } @Article{info:doi/10.2196/41940, author="Barry, Barbara and Zhu, Xuan and Behnken, Emma and Inselman, Jonathan and Schaepe, Karen and McCoy, Rozalina and Rushlow, David and Noseworthy, Peter and Richardson, Jordan and Curtis, Susan and Sharp, Richard and Misra, Artika and Akfaly, Abdulla and Molling, Paul and Bernard, Matthew and Yao, Xiaoxi", title="Provider Perspectives on Artificial Intelligence--Guided Screening for Low Ejection Fraction in Primary Care: Qualitative Study", journal="JMIR AI", year="2022", month="Oct", day="14", volume="1", number="1", pages="e41940", keywords="artificial intelligence", keywords="AI", keywords="machine learning", keywords="human-AI interaction", keywords="health informatics", keywords="primary care", keywords="cardiology", keywords="pragmatic clinical trial", keywords="AI-enabled clinical decision support", keywords="human-computer interaction", keywords="health care delivery", keywords="clinical decision support", keywords="health care", keywords="AI tools", abstract="Background: The promise of artificial intelligence (AI) to transform health care is threatened by a tangle of challenges that emerge as new AI tools are introduced into clinical practice. AI tools with high accuracy, especially those that detect asymptomatic cases, may be hindered by barriers to adoption. Understanding provider needs and concerns is critical to inform implementation strategies that improve provider buy-in and adoption of AI tools in medicine. Objective: This study aimed to describe provider perspectives on the adoption of an AI-enabled screening tool in primary care to inform effective integration and sustained use. Methods: A qualitative study was conducted between December 2019 and February 2020 as part of a pragmatic randomized controlled trial at a large academic medical center in the United States. In all, 29 primary care providers were purposively sampled using a positive deviance approach for participation in semistructured focus groups after their use of the AI tool in the randomized controlled trial was complete. Focus group data were analyzed using a grounded theory approach; iterative analysis was conducted to identify codes and themes, which were synthesized into findings. Results: Our findings revealed that providers understood the purpose and functionality of the AI tool and saw potential value for more accurate and faster diagnoses. However, successful adoption into routine patient care requires the smooth integration of the tool with clinical decision-making and existing workflow to address provider needs and preferences during implementation. To fulfill the AI tool's promise of clinical value, providers identified areas for improvement including integration with clinical decision-making, cost-effectiveness and resource allocation, provider training, workflow integration, care pathway coordination, and provider-patient communication. Conclusions: The implementation of AI-enabled tools in medicine can benefit from sensitivity to the nuanced context of care and provider needs to enable the useful adoption of AI tools at the point of care. Trial Registration: ClinicalTrials.gov NCT04000087; https://clinicaltrials.gov/ct2/show/NCT04000087 ", doi="10.2196/41940", url="https://ai.jmir.org/2022/1/e41940", url="http://www.ncbi.nlm.nih.gov/pubmed/38875550" }