<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e74111</article-id><article-id pub-id-type="doi">10.2196/74111</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Fine-Tuned Multimodal AI Chatbot for Dietary Health and Nutrition, Purrfessor: Development and Mixed Methods Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lu</surname><given-names>Linqi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Deng</surname><given-names>Yifan</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tian</surname><given-names>Chuan</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Sijia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shah</surname><given-names>Dhavan V</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Communication, University of North Dakota</institution><addr-line>221 Centennial Drive, Stop 7169</addr-line><addr-line>Grand Forks</addr-line><addr-line>ND</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Computer Science, University of Wisconsin&#x2013;Madison</institution><addr-line>Madison</addr-line><addr-line>WI</addr-line><country>United States</country></aff><aff id="aff3"><institution>School of Journalism and Mass Communication, University of Wisconsin&#x2013;Madison</institution><addr-line>Madison</addr-line><addr-line>WI</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Abbatantuono</surname><given-names>Chiara</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kim</surname><given-names>Kwanho</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Linqi Lu, PhD, Department of Communication, University of North Dakota, 221 Centennial Drive, Stop 7169, Grand Forks, ND, 58202, United States, 1 7017772137; <email>linqi.lu@und.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>30</day><month>4</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e74111</elocation-id><history><date date-type="received"><day>18</day><month>03</month><year>2025</year></date><date 
date-type="rev-recd"><day>26</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>27</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Linqi Lu, Yifan Deng, Chuan Tian, Sijia Yang, Dhavan Shah. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 30.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e74111"/><abstract><sec><title>Background</title><p>The integration of Large Language and Vision Assistant models with food and nutrition data enables multimodal meal analysis and contextual dietary guidance. Despite this potential, the reliability and practical usefulness of such systems for supporting everyday dietary decision-making remain underexplored.</p></sec><sec><title>Objective</title><p>This study introduces Purrfessor, an innovative artificial intelligence (AI) chatbot designed to provide personalized dietary guidance through interactive, multimodal engagement. 
The study aimed to evaluate its performance in ingredient recognition and recipe generation.</p></sec><sec sec-type="methods"><title>Methods</title><p>The Purrfessor chatbot was trained using a combination of the FoodData Central database from the US Department of Agriculture (USDA), the Recipe2img dataset featuring food images and corresponding recipes, a curated human-annotated dataset derived from Recipe1M, and a customized question-and-answer dialogue dataset. The system operates under a session-based, multiturn interaction paradigm, with memory retained only within an active session and no cross-session memory persistence. We implemented a 2-phase evaluation framework combining AI-based performance assessment and human scoring.</p></sec><sec sec-type="results"><title>Results</title><p>Purrfessor achieved a high average cosine similarity of 0.90 in ingredient recognition with human-coded references. In GPT-4.1&#x2013;based (OpenAI) evaluation of recipe generation quality, Purrfessor outperformed the raw Large Language and Vision Assistant model across all evaluated dimensions, with the largest improvements in completeness (7.44 vs 6.52), consistency (8.90 vs 7.81), and clarity (9.13 vs 8.39). Overall recipe quality improved from 7.66 to 8.35. Automatic metrics indicated strong ingredient coverage (0.78) and moderate step complexity (0.74), with lower coherence (0.62) and temperature and time specification (0.59), yielding an overall structured score of 0.68. Human evaluators rated Purrfessor&#x2019;s question-and-answer accuracy highly: correctness (mean 8.71, SD 1.15), relevance (mean 9.99, SD 0.10), and clarity (mean 9.33, SD 0.68). Error analysis indicated that 56% of responses contained minor hallucinations (ie, inclusion of inferred secondary details or invisible garnishes). 
At the same time, core food identification and overall recipe logic remained accurate.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Findings highlight the role of anthropomorphic chatbot design and multimodal AI in supporting engaging dietary health conversations. This study offers an example of AI-driven, evidence-based dietary guidance and underscores the potential of health chatbots to nudge informed health decision-making. Insights contribute to the development of digital health interventions and personalized health communication strategies, with implications for the design of engaging, user-centered AI health assistants.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>Large Language and Vision Assistant</kwd><kwd>LLaVA</kwd><kwd>computer vision</kwd><kwd>large language models</kwd><kwd>LLMs</kwd><kwd>digital health</kwd><kwd>diet health</kwd><kwd>health technology</kwd><kwd>nutrition</kwd><kwd>diet recommendation</kwd><kwd>chatbot</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Overview</title><p>Artificial intelligence (AI) chatbots have increasingly become integrated into daily human interactions, assisting users across a range of domains, from customer service to personal health management. Despite the widespread potential, individuals in underserved communities frequently encounter significant barriers to healthy meal planning, including limited grocery options, economic constraints, and time pressures [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. The evolution from rule-based systems to advanced chatbots that leverage natural language processing, machine learning, and large language models (LLMs) has enabled real-time, context-sensitive interactions tailored to individual behavior. 
AI-powered conversational agents are transforming health care and lifestyle management, particularly in health interventions. Unlike traditional health-tracking apps (eg, MyFitnessPal; MyFitnessPal, Inc), AI-enhanced chatbots provide dynamic, adaptive guidance, fostering personalized user engagement [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. AI-driven health chatbots hold significant promise in addressing key behavioral factors associated with chronic conditions, including cardiovascular disease, type 2 diabetes, and obesity [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. While mainstream health apps offer static tracking, they lack the interactive and adaptive capabilities of AI chatbots, which integrate multimodal inputs for more responsive and personalized health guidance [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Leveraging computer vision in conjunction with LLMs, chatbots can analyze user-uploaded meal images, deliver prompt dietary feedback, and enhance their role in health behavior modification [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>This study introduces Purrfessor, a multimodal chatbot designed to provide personalized dietary guidance by fine-tuning the LLaVA-v1.6&#x2010;13B base model to enable advanced interactive functionalities [<xref ref-type="bibr" rid="ref12">12</xref>]. By leveraging user-uploaded food images and text prompts, Purrfessor uses structured data to map food types to nutritional information and recipe suggestions, enhancing the accuracy and relevance of dietary assessments [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. This study explores how AI-driven chatbots can serve as engaging health companions [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. 
By providing prompt, accessible meal suggestions based on available ingredients, the chatbot aims to help individuals and families build nutrition knowledge and support healthier dietary habits [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. This research contributes to the development of AI-driven health assistants that move beyond static recommendations, offering dynamic, context-aware support for sustained lifestyle modifications [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref17">17</xref>].</p></sec><sec id="s1-2"><title>Advancements in Multimodal AI for Health Chatbots</title><p>The integration of multimodal AI has expanded the capabilities of health chatbots, enabling dynamic, personalized dietary interventions. Unlike traditional health-tracking apps that rely on static databases, AI-driven chatbots can process real-time user inputs, including text and images, to offer adaptive health recommendations [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Studies indicate that chatbots can improve adherence to health guidelines and promote sustained behavior change [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. However, limitations remain, particularly in terms of accuracy, engagement, and trustworthiness. AI-generated recommendations risk hallucinations (fabricated content), inconsistent advice, and privacy concerns, posing challenges for dietary-tracking apps [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. 
Addressing these challenges is crucial to enhancing user trust and long-term engagement in AI-powered health interventions [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Recent advancements in multimodal AI have sought to improve chatbot accuracy and engagement, with instruction tuning emerging as a key technique in refining AI-driven interactions. Large Language and Vision Assistant (LLaVA) represents one such innovation, integrating a visual encoder with an LLM to process combined image-text inputs. By leveraging structured instruction tuning, LLaVA enhances the chatbot&#x2019;s ability to interpret visual dietary data and generate relevant feedback, aligning closely with state-of-the-art language models and achieving more than 85% of GPT-4&#x2019;s (OpenAI) accuracy in image-text reasoning tasks [<xref ref-type="bibr" rid="ref12">12</xref>]. This development supports applications where AI-powered chatbots analyze user-uploaded images of meals, providing prompt, tailored dietary assessments [<xref ref-type="bibr" rid="ref11">11</xref>].</p></sec><sec id="s1-3"><title>Anthropomorphism and Chatbot Image</title><p>Chatbots that exhibit social characteristics tend to foster stronger relational bonds with users, thereby enhancing user engagement and adherence to health recommendations [<xref ref-type="bibr" rid="ref9">9</xref>]. However, delivering health interventions can be challenging because individuals often resist advice perceived as threatening their established habits or personal autonomy [<xref ref-type="bibr" rid="ref21">21</xref>]. Such resistance typically arises from psychological reactance, in which users perceive health advice as restrictive or authoritative, leading to disengagement or counterarguments [<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>Anthropomorphism, the attribution of human-like qualities to nonhuman entities, offers an effective strategy to mitigate this resistance. 
By displaying relatable, human-like traits, chatbots can facilitate more natural and socially engaging interactions [<xref ref-type="bibr" rid="ref23">23</xref>]. Attachment theory further supports this approach, suggesting users are more likely to develop emotional connections, trust, and ultimately adhere to health recommendations when interacting with chatbots that possess relatable, personable characteristics [<xref ref-type="bibr" rid="ref24">24</xref>]. Moreover, integrating interpersonal communication theories into chatbot interactions can enhance relational dynamics, thereby improving their capacity to influence user behavior [<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>According to the Media Are Social Actors paradigm, users respond socially to media entities that display recognizable social cues, even when they acknowledge these entities as artificial constructs [<xref ref-type="bibr" rid="ref26">26</xref>]. Adopting an approachable, cat-themed professor persona is a potential way to leverage these social interaction principles. This playful and companionable persona not only reduces resistance by avoiding an authoritative tone but also fosters an enjoyable, collaborative atmosphere that motivates users to explore dietary health behaviors [<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>System Overview</title><p>The Purrfessor chatbot is an AI-powered dietary assistant integrating natural language processing and computer vision to provide personalized nutrition guidance. The system architecture consists of a web-based user interface, a backend server facilitating data exchange, a conversation database for storing user interactions, and cloud-hosted AI models for text and image processing (<xref ref-type="fig" rid="figure1">Figure 1</xref>). 
The workflow enables near real-time meal analysis, dietary recommendations, and interactive chatbot engagement.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>System architecture. LLaVA: Large Language and Vision Assistant.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e74111_fig01.png"/></fig></sec><sec id="s2-2"><title>System Components</title><p>The user interaction module provides a web-based chat interface for text-based conversations or for uploading food images for dietary analysis. The chatbot is visually represented by a cat-themed persona, &#x201C;Purrfessor,&#x201D; designed to convey an approachable yet knowledgeable character and enhance user engagement. The system also supports user authentication, enabling personalized dietary recommendations based on stored preferences and meal history. However, in the evaluated deployment, interactions are processed in a session-based setting without the retrieval of conversational history across sessions.</p><p>The backend infrastructure is implemented using a Node.js server that facilitates communication among the user interface, AI models, and database services. The server manages HTTP requests, processes application programming interface (API) calls, and maintains session continuity. User interactions are stored in a MongoDB conversation database, enabling conversational persistence, analysis of engagement patterns, and incremental personalization of responses over time.</p><p>The fine-tuned LLaVA model is deployed in a cloud-based computing environment to support scalable inference and multimodal processing. 
Cloud hosting enables efficient handling of image inputs and response generation for multiple concurrent users, while also supporting model updates and iterative fine-tuning to improve response accuracy and system performance.</p></sec><sec id="s2-3"><title>User Interaction and Engagement Features</title><p>The chatbot interface (<xref ref-type="fig" rid="figure2">Figure 2</xref>) is designed to support intuitive, seamless user interaction through visual prompts and guided navigation. To facilitate onboarding and early exploration, the interface displays prompt suggestions above the conversation bar that illustrate common interaction scenarios, such as requesting recipe ideas based on refrigerator contents or exploring specific dietary preferences. These prompts can be selected to automatically populate the input field, reducing user effort and lowering the barrier to initial engagement.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Purrfessor interface [<xref ref-type="bibr" rid="ref28">28</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e74111_fig02.png"/></fig><p>Navigation support is further enhanced by menu icons supplemented with tooltip-based hints that provide brief, context-specific explanations when users hover over or tap an icon (eg, &#x201C;Click here to generate a custom meal plan based on your preferences&#x201D;). Additional assistance is available via &#x201C;?&#x201D; icons adjacent to advanced features, allowing users to access clarification without disrupting the interaction flow. The interface also includes a visually distinct image upload button that enables users to submit photos of meals or ingredients for AI-driven analysis. Visual cues, such as color changes, encourage image uploads and highlight the chatbot&#x2019;s multimodal capabilities. 
For first-time users, a guided introduction outlines key functionalities, including prompt selection, image uploads, and navigation, demonstrating how to initiate conversations using example prompts or custom questions and reducing initial uncertainty during early use.</p></sec><sec id="s2-4"><title>Response Latency Metrics</title><p>We report response latency metrics for the system configuration used in all evaluations presented in this study (2025 deployment). The system uses streaming inference with a virtual large language model on a single NVIDIA RTX 6000 Ada Graphics Processing Unit (GPU). For a typical response of approximately 500 tokens, input token processing required approximately 2&#x2010;3 seconds, after which output tokens were streamed to the user as they were generated. Full response generation, including server-side processing and database forwarding, was completed within approximately 10&#x2010;12 seconds.</p></sec><sec id="s2-5"><title>Training Data Sources</title><p>The fine-tuning dataset was curated from three sources: (1) FoodData Central: Foundation Foods, which is a structured food and nutrition database from the US Department of Agriculture (USDA), which provides detailed nutritional profiles [<xref ref-type="bibr" rid="ref29">29</xref>]; (2) Recipe2img dataset: a dataset containing paired food images and recipe descriptions, designed for learning cross-modal representations between text and visuals [<xref ref-type="bibr" rid="ref11">11</xref>]; and (3) a human-annotated dataset: a curated collection of food images labeled with nutritional information and cooking instructions, derived from the Recipe1M dataset. 
This dataset was used to construct instruction-tuning prompts and to enhance the contextual relevance of dietary recommendations.</p></sec><sec id="s2-6"><title>Human-Annotated Data Preparation</title><p>First, a Google Image Search method was implemented to systematically collect food-related images from online sources using Python (Python Software Foundation) and the Google Custom Search API, an approach adapted from similar image-retrieval practices in data-intensive research [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. The collection process was facilitated through the Google API Python Client library [<xref ref-type="bibr" rid="ref32">32</xref>], with a custom search engine ID and API key configured to automate retrieval. A set of predefined search queries, tailored with keywords on raw produce and ingredients for cooking (ie, &#x201C;raw food,&#x201D; &#x201C;produce and meat in fridge,&#x201D; &#x201C;fresh produce,&#x201D; &#x201C;produce in fridge,&#x201D; &#x201C;food ingredients,&#x201D; &#x201C;raw meat and produce,&#x201D; &#x201C;cooking raw meat and vegetables,&#x201D; and &#x201C;fresh produce for cooking&#x201D;) was used to filter relevant images and metadata. Metadata extracted for each image included the collection date, page title, source website, image URL, and page URL for image validation use. Where available, dates were extracted directly from URLs using regular expressions, allowing chronological organization of data [<xref ref-type="bibr" rid="ref33">33</xref>]. Images were accessed via licensed use of Google Custom Search API in compliance with Google&#x2019;s terms of service.</p><p>Second, the image captioning and corresponding question-and-answer (Q&#x0026;A) were generated via GPT-4o (OpenAI). Each image in the dataset was processed using a prompt specifically designed to elicit detailed captions and chatbot-friendly Q&#x0026;A examples. 
The prompt instructed the model to generate the following: a detailed caption describing the visible ingredients; a Q&#x0026;A response including a greeting, nutritional information for the ingredients, healthy recipe suggestions, a step-by-step guide for each recipe, and a closing message encouraging user engagement. To maintain consistency, each prompt followed a structured output format, facilitating easy parsing and subsequent training of the LLaVA model. The inclusion of nutritional information and recipe suggestions was essential to simulate real-world interaction, thereby training the model to respond informatively and engagingly [<xref ref-type="bibr" rid="ref34">34</xref>].</p><p>Third, we completed the human review and editing for the compiled GPT (OpenAI) output. Following the initial output generation, human annotators reviewed each caption and Q&#x0026;A example for clarity, accuracy, and alignment with expected chatbot responses. Annotators checked for language appropriateness, grammatical correctness, logical flow, edge case handling, and special characters. For ambiguous or nonfood images, annotators crafted captions and Q&#x0026;A responses neutrally, providing factual descriptions of the content per the prompt guidelines [<xref ref-type="bibr" rid="ref33">33</xref>]. Additionally, edge cases were included in the full dataset, with Q&#x0026;A examples revised by humans, ensuring robust training data that would prepare the model to handle diverse real-world scenarios.</p></sec><sec id="s2-7"><title>Chatbot Fine-Tuning Approach</title><p>The chatbot was fine-tuned using LLaVA-v1.6-Vicuna-13B, a multimodal LLM that integrates an open-set visual encoder from Contrastive Language&#x2013;Image Pretraining with the Vicuna language decoder. Fine-tuning was performed on a high-performance GPU computing server featuring dual NVIDIA RTX 6000 Ada Generation GPUs (48 GB VRAM per GPU; 18,176 CUDA cores; and 568 Tensor cores), a 64-core CPU, and 512 GB RAM. 
Training used Ubuntu 22.04 with the Lambda Stack environment. The designed architecture of Purrfessor enables the model to process and interpret both visual and textual inputs within instruction-based contexts, making it suitable for tasks such as image interpretation, dialogue reasoning, and dietary guidance [<xref ref-type="bibr" rid="ref12">12</xref>]. The data structure contains (1) the FoodData Central: Foundation Foods from the USDA, (2) the Recipe2img dataset by Salvador et al [<xref ref-type="bibr" rid="ref11">11</xref>] (n=3000), and (3) a human-annotated visual dataset (n=500). The instruction-tuning dataset also included supportive opening and closing language, enhancing the emotional support provided to users.</p><p>Initially, full fine-tuning of LLaVA-v1.6-Vicuna-13B was attempted using the Recipe1M dataset. However, the high GPU memory requirements for tuning a 13B parameter model exceeded hardware constraints. To address this, Low-Rank Adaptation (LoRA) was implemented, allowing efficient fine-tuning by modifying a subset of model parameters. Early LoRA experiments revealed overfitting, where the model defaulted to generating generic cooking instructions for all queries, irrespective of context. To mitigate this issue, additional human-annotated visual data were introduced to improve context-awareness in meal recommendations. Fine-tuning incorporated instruction-tuning prompts that emphasized ingredient recognition and personalized dietary suggestions rather than generic recipe generation. To ensure that the chatbot generated precise nutrition-related responses, the FoodData Central Foundation Foods dataset was integrated into the instruction tuning. Knowledge injection techniques refined the chatbot&#x2019;s ability to distinguish food components and provide relevant nutritional insights. 
Additionally, structured prompt engineering ensured that chatbot interactions maintained a natural conversational flow, balancing factual dietary information with engaging, supportive dialogue.</p><p>The fine-tuned Purrfessor chatbot integrates AI-driven dietary recommendations with personalized, image-based analysis. The model&#x2019;s training pipeline optimized its ability to deliver context-aware meal-planning suggestions, reducing bias toward predefined responses and improving its adaptability to user-specific queries.</p></sec><sec id="s2-8"><title>Evaluation Framework</title><p>To assess the performance of the fine-tuned visual chatbot Purrfessor, we adopted a mixed methods, 2-phase evaluation framework. This design integrated both automated scoring mechanisms and human validation procedures to capture technical performance, contextual relevance, semantic robustness, and user-centered feedback.</p></sec><sec id="s2-9"><title>Simulation Dataset</title><p>This study randomly selected 100 real-world images from a second-round Google Image Search pool, which served as a held-out evaluation set and did not overlap with any training or fine-tuning data. To emulate authentic use cases, we paired each image with a synthetically generated user prompt, yielding 100 Q&#x0026;A pairs. Image content represented a wide range of food types, including everyday meals, beverages, and raw ingredients, to reflect the variety users might encounter in real-world applications. Prompt diversity included direct identification tasks (eg, &#x201C;What food-related items appeared in this image?&#x201D;) and contextual queries (eg, &#x201C;Do you have ideas on recipes with low-fat per the current ingredients I provide?&#x201D;). This ensured the 100 Q&#x0026;A pairs captured both fundamental visual object detection and high-level reasoning in meal planning. 
To evaluate the food detection and recipe quality, prompts were designed to reflect a range of interaction intents, including visual identification tasks (eg, &#x201C;Please comprehensively list all food-related items appeared in this image?&#x201D; and &#x201C;Please only list food-related items appeared in this image with high confidence&#x201D;), and general recipe generation (eg, &#x201C;Based on this image, what food recipe would you recommend?&#x201D;).</p></sec><sec id="s2-10"><title>Food Detection Performance</title><p>To assess ingredient recognition capabilities, we implemented a semantic evaluation pipeline based on Bidirectional Encoder Representations from Transformers, using the all-MiniLM-L6-v2 model. Ground truth ingredient labels were human-coded, and model predictions were compared using a cosine similarity-based matching algorithm with a minimum semantic threshold of 0.7. Evaluation was performed under 3 prompt conditions: comprehensive detection (the model was prompted to list all visible food items exhaustively), confident detection (the model was prompted to name only items it could identify with high confidence), and naturalistic prompting (the model was given open-ended prompts such as, &#x201C;What food recipe would you recommend based on this image?&#x201D; without explicitly requesting full ingredient enumeration).</p></sec><sec id="s2-11"><title>Recipe Quality: LLM-as-a-Judge</title><p>To evaluate recipe generation, we adopted the G-Eval framework [<xref ref-type="bibr" rid="ref35">35</xref>], a chain-of-thought evaluation method implemented using GPT-4.1 as an expert synthetic evaluator. 
Each generated recipe was evaluated across 7 key dimensions: correctness (alignment with image content), relevance (task adherence), clarity (readability and structure), completeness (coverage of cooking steps), consistency (logical flow), practicality (procedural feasibility of executing the recipe once the required ingredients are available), and safety (food safety awareness). Scores were assigned on a 10-point Likert scale, and a weighted average was computed as the overall quality metric. This &#x201C;LLM-as-a-Judge&#x201D; method has been increasingly validated for evaluating natural language generation systems. Each recipe sample was evaluated based on its alignment with the corresponding food image and user prompt. The evaluation script passed three structured components to the model: (1) the user prompt, (2) the generated recipe, and (3) the base64-encoded image. A system prompt instructed GPT-4.1 to follow a stepwise chain-of-thought procedure involving (1) visual analysis of the dish image, (2) breakdown of the recipe content (eg, ingredients, steps, and safety), and (3) cross-modal validation between image and text.</p></sec><sec id="s2-12"><title>Recipe-Specific Automatic Evaluation</title><p>To complement LLM-based evaluation, we applied a reference-free, domain-specific scoring framework [<xref ref-type="bibr" rid="ref36">36</xref>]. Four structured metrics were implemented: ingredient coverage (consistency between ingredients listed in the &#x201C;Ingredients Overview&#x201D; and those used in the recipe body), step complexity (capturing instructional richness through operation diversity, sentence length, and parameter inclusion), recipe coherence (validating logical sequencing and completeness using rule-based checks), and temperature and time specification (the presence and plausibility of safety-critical parameters). Scores ranged from 0 to 1. 
An overall weighted quality score was computed using the following weights: coverage (0.3), complexity (0.2), coherence (0.3), and specification (0.2). The system is implemented in Python 3.8 (Python Software Foundation) using <italic>NLTK</italic> and <italic>NumPy</italic> and supports scalable recipe evaluation.</p></sec><sec id="s2-13"><title>Human Evaluation</title><p>Three human coders manually reviewed and rated each of the 100 chatbot-generated responses based on correctness, relevance, and clarity (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app2">2</xref>). Two raters were independent research assistants who volunteered their time, and one rater was a study author. All raters received the same training and applied identical evaluation criteria. Each dimension was rated on a 10-point scale, adapted from validated chatbot assessment frameworks [<xref ref-type="bibr" rid="ref16">16</xref>]. Coders were trained before evaluation, and a scoring rubric was provided to ensure interrater consistency. Krippendorff &#x03B1; was calculated to assess intercoder reliability, which ranged from 0.85 to 1.00 across dimensions.</p></sec><sec id="s2-14"><title>Ethical Considerations</title><p>This study used a simulation-based evaluation design and did not involve direct interaction with human participants for research data collection. Human evaluation was conducted on chatbot-generated responses rather than on human behavioral or personal data. No personal or sensitive data were collected or analyzed. As no human participants were recruited, informed consent was not required, and no compensation was provided. All electronic research data were stored on secure, password-protected institutional servers accessible only to authorized research team members. 
The system evaluated in this study operates under a session-based configuration without cross-session memory persistence, and no real user interaction data were collected or analyzed for this research. This research project was reviewed and approved by the Institutional Review Board at the University of Wisconsin&#x2013;Madison (2023&#x2010;1416).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Food Detection Evaluation</title><p>We first evaluated ingredient recognition performance across 3 prompting scenarios. In the comprehensive detection task, where the chatbot was instructed to identify all visible ingredients exhaustively, the average cosine similarity between predicted and ground-truth ingredient pairs improved from 0.80 with the original raw LLaVA model to 0.84 with the fine-tuned Purrfessor model, reflecting a 5% gain in semantic alignment. In the confident prompt condition, where the chatbot was asked to list only clearly visible ingredients, similarity further increased from 0.85 to 0.87 (+2%).</p><p>To assess model behavior under naturalistic prompting conditions, we conducted a simulation using 100 randomly sampled, image-based user queries (eg, &#x201C;Based on the image, what food recipe would you recommend?&#x201D;). Although exhaustive ingredient enumeration was not explicitly requested, chatbot responses consistently included an &#x201C;Ingredient Overview&#x201D; section listing salient food items. When benchmarked against human-coded labels, the model achieved a precision of 0.65, indicating that the majority of predicted ingredients were relevant. 
The average cosine similarity of 0.90 across matched ingredient pairs suggests strong semantic alignment, even in the absence of exact lexical matches.</p></sec><sec id="s3-2"><title>Recipe Quality Evaluation</title><p>To compare end-to-end recipe generation quality across models, we used the LLM-as-a-Judge framework with GPT-4.1 in the G-Eval structured evaluation pipeline [<xref ref-type="bibr" rid="ref35">35</xref>]. For each sample, models were prompted with &#x201C;Using the food and ingredients shown in this image, please recommend a healthy recipe and estimate its nutritional information.&#x201D; Generated outputs were rated across 7 dimensions [<xref ref-type="bibr" rid="ref35">35</xref>]: correctness, relevance, clarity, completeness, consistency, practicality, and safety, using a 10-point scale. A weighted average was computed to produce an overall quality score. As shown in <xref ref-type="table" rid="table1">Table 1</xref>, the Purrfessor model consistently outperformed the raw LLaVA model across all 7 dimensions. Paired <italic>t</italic> tests were conducted on 100 matched samples to compare the fine-tuned Purrfessor model with the raw LLaVA baseline. 
Results indicated that the fine-tuned model achieved significantly higher scores across all 7 evaluation dimensions, including correctness (<italic>t</italic><sub>99</sub>=4.23; <italic>P</italic>&#x003C;.001), relevance (<italic>t</italic><sub>99</sub>=3.11; <italic>P</italic>=.002), clarity (<italic>t</italic><sub>99</sub>=6.03; <italic>P</italic>&#x003C;.001), completeness (<italic>t</italic><sub>99</sub>=6.95; <italic>P</italic>&#x003C;.001), consistency (<italic>t</italic><sub>99</sub>=7.45; <italic>P</italic>&#x003C;.001), practicality (<italic>t</italic><sub>99</sub>=4.28; <italic>P</italic>&#x003C;.001), and safety (<italic>t</italic><sub>99</sub>=5.10; <italic>P</italic>&#x003C;.001).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>GPT-4.1 (G-Eval) structured evaluation of recipe generation quality comparing the raw LLaVA-v1.6-13B model and the fine-tuned Purrfessor multimodal dietary chatbot across 100 simulated food image&#x2013;prompt pairs (United States, 2025 system deployment).<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Evaluation dimension</td><td align="left" valign="bottom">Raw LLaVA<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, mean (SD)</td><td align="left" valign="bottom">Purrfessor, mean (SD)</td><td align="left" valign="bottom">Absolute improvement</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Correctness</td><td align="left" valign="top">6.14 (1.53)</td><td align="left" valign="top">6.72 (1.52)</td><td align="left" valign="top">0.58</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Relevance</td><td align="left" valign="top">8.01 (1.73)</td><td align="left" valign="top">8.47 (1.79)</td><td align="left" valign="top">0.46</td><td align="left" 
valign="top">.002</td></tr><tr><td align="left" valign="top">Clarity</td><td align="left" valign="top">8.39 (1.18)</td><td align="left" valign="top">9.13 (0.87)</td><td align="left" valign="top">0.74</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Completeness</td><td align="left" valign="top">6.52 (1.32)</td><td align="left" valign="top">7.44 (1.06)</td><td align="left" valign="top">0.92</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Consistency</td><td align="left" valign="top">7.81 (1.45)</td><td align="left" valign="top">8.90 (1.11)</td><td align="left" valign="top">1.09</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Practicality</td><td align="left" valign="top">8.52 (1.27)</td><td align="left" valign="top">9.11 (1.12)</td><td align="left" valign="top">0.59</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Safety</td><td align="left" valign="top">8.24 (1.27)</td><td align="left" valign="top">8.67 (1.02)</td><td align="left" valign="top">0.43</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Overall score (weighted average)</td><td align="left" valign="top">7.66</td><td align="left" valign="top">8.35</td><td align="left" valign="top">0.69</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Recipes were generated in response to image-based dietary guidance queries and evaluated using a large language model-as-a-judge framework (GPT-4.1) across 7 dimensions: correctness, relevance, clarity, completeness, consistency, practicality, and safety (10-point Likert scale). Paired <italic>t</italic> tests were conducted on 100 matched samples to assess improvements attributable to domain-specific fine-tuning. 
The overall weighted score represents a composite across all 7 dimensions. All evaluations were conducted under a session-based deployment configuration without cross-session memory persistence.</p></fn><fn id="table1fn2"><p><sup>b</sup>LLaVA: Large Language and Vision Assistant.</p></fn><fn id="table1fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>To complement the LLM-based assessment, we implemented a domain-specific automatic evaluation framework based on 4 reference-free, structure-aware metrics: (1) ingredient coverage, (2) step complexity, (3) recipe coherence, and (4) temperature and time specification. These metrics emphasize internal consistency and executability, independent of reference text similarity. On the same 100 image-prompted samples, the model achieved an ingredient coverage score of 0.78, reflecting strong alignment between the &#x201C;Ingredients Overview&#x201D; and ingredients actually used in recipe steps. The step complexity score of 0.74 indicated moderately detailed instruction sets with diverse cooking operations. Recipe coherence was lower at 0.62, revealing some violations of expected temporal and logical sequencing (eg, adding ingredients before preheating). The temperature and time specification score of 0.59 suggested partial inclusion of cooking duration and temperature parameters. 
The aggregated overall structured evaluation score was 0.68, placing recipe quality in the good range, with opportunities for improvement in procedural coherence and parameter completeness.</p></sec><sec id="s3-3"><title>Human Evaluations</title><p>Human evaluation was conducted on 100 Q&#x0026;A simulations generated by Purrfessor, assessing 3 core criteria (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Human evaluation of 100 simulated image-based dietary Q&#x0026;A responses generated by the fine-tuned Purrfessor multimodal chatbot (United States, 2025 system deployment).<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Evaluation criteria</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Krippendorff &#x03B1; for high performance (8-10)</td></tr></thead><tbody><tr><td align="left" valign="top">Correctness</td><td align="left" valign="top">8.71 (1.15)</td><td align="left" valign="top">0.85</td></tr><tr><td align="left" valign="top">Relevance</td><td align="left" valign="top">9.99 (0.10)</td><td align="left" valign="top">1.00</td></tr><tr><td align="left" valign="top">Clarity</td><td align="left" valign="top">9.33 (0.68)</td><td align="left" valign="top">0.95</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>All results were generated using the 2025 system deployment. Three trained raters evaluated chatbot responses to food image-prompt pairs using 10-point Likert scales across 3 dimensions: correctness, relevance, and clarity. Mean scores and SDs are reported. 
Krippendorff &#x03B1; was calculated to assess interrater reliability for high-performance ratings (scores 8&#x2010;10).</p></fn></table-wrap-foot></table-wrap><list list-type="order"><list-item><p>Correctness (mean 8.71, SD 1.15): correctness ratings primarily reflect the accuracy of core elements, such as the identification of main ingredients and the alignment of the recommended cooking approach with the image content and user query. Chatbot responses were largely accurate, though minor discrepancies occurred with visually similar food items (eg, distinguishing arugula from green lettuce) or low-quality images.</p></list-item><list-item><p>Relevance (mean 9.99, SD 0.1): chatbot-generated responses appropriately addressed user queries, although context-specific prompts occasionally received generalized answers.</p></list-item><list-item><p>Clarity (mean 9.33, SD 0.68): responses were well-structured and aligned with training formats. However, output truncation due to token limitations occasionally resulted in incomplete responses.</p></list-item></list><p>A follow-up error analysis examined the nature of inaccuracies in chatbot-generated responses. Overall, 12% of responses exhibited ambiguity (eg, vague ingredient references or unclear procedural steps) and 56% contained hallucinations. An additional 4% of responses exhibited both ambiguity and hallucination, whereas 36% showed no detectable errors. In this analysis, hallucinations were defined as the inclusion of unsupported or inferred details, including commonly used ingredients and seasonings that were not visible in the input image but were mentioned in the responses. Importantly, hallucinations were coded even when the primary food identification and overall recipe logic were otherwise accurate. 
As a result, many responses that received high correctness scores in the human evaluation were still classified as containing hallucinations, reflecting that correctness and hallucination capture different aspects of response quality.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study introduced the function and architecture of Purrfessor, a fine-tuned multimodal LLaVA chatbot designed for personalized dietary guidance through image-based and conversational interactions. Using a mixed methods evaluation framework that combined GPT-based recipe quality assessment, reference-free automated metrics, and human validation, we systematically examined the effects of domain-specific fine-tuning on food recognition and recipe generation tasks.</p><p>Across evaluations, Purrfessor consistently outperformed its raw LLaVA base model on all 7 GPT-4.1&#x2013;rated recipe quality dimensions, with the largest gains observed in clarity, consistency, and completeness. Ingredient recognition under naturalistic prompting achieved a high average cosine similarity (0.90), indicating robust semantic alignment between detected foods and user-facing descriptions even when prompts did not explicitly request exhaustive food identification. These findings suggest that domain-specific fine-tuning can meaningfully enhance multimodal semantic grounding and response quality in dietary contexts.</p><p>At the same time, the evaluation results reveal important distinctions between different dimensions of system performance. High scores on clarity reflect the model&#x2019;s strength in generating fluent, well-structured, and easily interpretable natural language responses. In contrast, the lower automated recipe coherence score highlights occasional limitations in maintaining correct temporal or logical sequencing of cooking steps. 
This divergence underscores that linguistic fluency and procedural reasoning represent related but distinct capabilities, and that strong natural language generation does not necessarily guarantee fully executable or logically optimal cooking procedures. Together, these findings illustrate the value of multidimensional evaluation frameworks for multimodal health AI systems, as different metrics capture complementary aspects of usability, correctness, and practical reasoning.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>These findings build on and extend existing research in health chatbot design and multimodal AI. While previous research has established that LLMs can generate health-related responses with contextual relevance [<xref ref-type="bibr" rid="ref1">1</xref>], relatively few have investigated the capabilities of integrated vision-language systems for food recognition and personalized nutrition guidance. Findings demonstrate that such systems, when fine-tuned with domain-specific data, can achieve high levels of semantic alignment in ingredient identification and generate user-aligned recipe recommendations.</p><p>Beyond usability, Purrfessor could enhance nutrition and meal planning among laypeople who may lack familiarity with digital environments. By leveraging natural language and image-based interactions, Purrfessor enables users to ask questions easily and receive prompt, understandable responses grounded in USDA-supported nutritional data, lowering the intimidation often associated with complex health apps or dense databases. Purrfessor can help lay users expand their nutrition knowledge by explaining the benefits of various food choices and suggesting recipes tailored to individual needs. For individuals with limited food resources, Purrfessor can offer practical suggestions for affordable alternatives that maintain essential nutrient intake, supporting healthier dietary practices in constrained contexts. 
Additionally, the chatbot&#x2019;s friendly, anthropomorphic avatar design may support nutrition education among younger users, fostering healthy eating habits through daily engagement and gentle nudges toward better choices. Overall, this study reinforces the emerging view that multimodal, anthropomorphically designed chatbots can function not only as information tools but as relational companions in digital health contexts.</p><p>Meanwhile, recent work has increasingly adopted LLMs such as GPT-4 as evaluators for open-ended generation tasks, demonstrating strong alignment with human judgments on dimensions including clarity, relevance, and safety [<xref ref-type="bibr" rid="ref35">35</xref>]. However, prior studies have also cautioned that LLM-as-a-Judge frameworks may overestimate performance when the evaluator shares training distributions or representational priors with the evaluated system. Consistent with this literature, this study treats GPT-based evaluation as an upper-bound estimate and complements it with reference-free, rule-based metrics that emphasize procedural correctness and executability. This combined approach highlights both the strengths and limitations of current LLM-based evaluation paradigms in multimodal health AI.</p></sec><sec id="s4-3"><title>Limitations</title><p>Despite the promising findings of this study, several limitations should be acknowledged. First, the chatbot&#x2019;s context awareness was limited to session-level memory. While the system supports multiturn interactions and can maintain conversational context within an active session, it does not retain conversation history once the session is closed or the interface is reloaded. As a result, contextual continuity across separate user visits is not preserved. 
While findings highlight the importance of persistent memory for improving long-term personalization and conversational coherence, this study did not evaluate specific implementation strategies for cross-session memory retention.</p><p>Second, although the core chatbot model is based on LLaVA-v1.6-13B, GPT-4o was used to assist with structured training data generation, and a GPT-4&#x2013;based model was used as an evaluator under the G-Eval (&#x201C;LLM-as-a-Judge&#x201D;) framework. While training data were subsequently reviewed and edited by human annotators and the evaluation model was not used for training, this model-family overlap introduces a potential circular validation bias that may inflate performance estimates. Accordingly, results derived from G-Eval should be interpreted as upper-bound estimates rather than fully independent validations.</p><p>Third, the relevance dimension showed a near-ceiling effect (mean 9.99, SD 0.10) in human scoring, likely reflecting the design limitations of the human evaluation rubric. Relevance was defined primarily in terms of task-level appropriateness, whether the response directly addressed the user&#x2019;s question, rather than the degree of contextual specificity or personalization. Consequently, responses that were occasionally described qualitatively as generalized could still receive high relevance scores if they adequately addressed the prompt.</p><p>Fourth, although the study distinguishes hallucinations based on visual or presentational salience, hallucinations involving nutritionally or medically relevant ingredients (eg, salt, oil, and sugar) may pose material risks for users with specific dietary restrictions or chronic health conditions. While such ingredients may be visually inconspicuous, their presence or absence can have meaningful health implications in certain contexts. 
The current system does not implement ingredient-level verification or medical rule enforcement, as introducing strict constraints or external validation layers may reduce the flexibility and creativity that are central to recipe generation and user engagement.</p><p>Additionally, while real-world food images were used to enhance ecological realism at the input level, the evaluation relied on synthetically generated prompts for standardization. Such prompts may not fully capture the variability and noise of unconstrained human interactions, and thus, the findings primarily reflect model performance under structured task conditions.</p><p>Finally, although the fine-tuned Purrfessor model is compared against its raw LLaVA base to isolate the effects of domain-specific fine-tuning, this evaluation does not establish comparative performance against other state-of-the-art dietary assistants or text-only LLMs provided with structured ingredient inputs. Thus, the results should be interpreted as evidence of within-architecture improvement rather than global model superiority.</p></sec><sec id="s4-4"><title>Future Work</title><p>Future research may explore ways to support continuity across user sessions, allowing the system to maintain context over time while carefully addressing privacy, data governance, and efficiency considerations. Longitudinal studies of sustained use would help clarify how conversational continuity influences personalization, user experience, and dietary decision support. In addition, future development may consider incorporating clearer safeguards around ingredient accuracy, particularly for individuals with dietary restrictions or chronic health conditions, where small inaccuracies could have meaningful implications.</p><p>Methodological improvements are also needed to strengthen evaluation rigor. 
Future studies may include independent evaluation approaches that do not rely on LLMs, as well as greater involvement of domain experts in assessing output quality. Refining human evaluation instruments to better capture contextual depth and personalization could further distinguish between responses that simply address a prompt and those that demonstrate more adaptive, situation-specific reasoning.</p><p>Finally, future research should assess system performance under more naturalistic conditions. Collecting and analyzing organically generated user prompts would provide insight into how the system performs amid the variability and unpredictability of real-world interactions. Broader comparisons with other multimodal and text-based systems would also help determine whether the observed improvements reflect model-specific tuning effects or more general advances in multimodal dietary guidance.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study presented the design and evaluation of Purrfessor, a fine-tuned multimodal LLaVA-based chatbot developed to support multimodal dietary guidance. Using a mixed methods evaluation framework that combined simulation testing, automated metrics, and human scoring, we assessed the system performance in ingredient recognition and recipe generation quality.</p><p>Compared with the base LLaVA model, the fine-tuned Purrfessor model demonstrated improved semantic alignment in ingredient recognition and achieved higher scores across all 7 GPT-4.1&#x2013;rated recipe quality dimensions, including correctness, relevance, clarity, completeness, consistency, practicality, and safety. Automated structured metrics further identified strengths in ingredient coverage and step complexity, alongside limitations in procedural coherence and temperature and time specification. 
Human evaluations further showed high ratings for correctness, relevance, and clarity of chatbot responses, while error analysis highlighted the presence of hallucinations in secondary details, underscoring the need to interpret accuracy and safety as distinct dimensions of system performance.</p><p>These findings indicate that domain-specific fine-tuning of a multimodal vision-language model can improve performance on food-related reasoning tasks under controlled simulation conditions. The results provide empirical evidence regarding the measurable capabilities and limitations of multimodal dietary chatbots evaluated through mixed methods assessment.</p></sec></sec></body><back><ack><p>A US provisional patent application related to the Purrfessor chatbot (application no. 63/734,112) was filed on December 15, 2024. The provisional application was not converted to a nonprovisional patent and has expired; no active or enforceable patent protection exists. The authors report no current commercial, licensing, or financial interests related to this work. We also appreciate Yuting Wang and Zihan Wan for volunteering their time and assistance with the human validation process.</p></ack><notes><sec><title>Funding</title><p>Funding to support access to the UW-MCRC Graphics Processing Unit (GPU) for housing and running the large language models, as well as licensing fees for software access, was provided by a grant from the John S. and James L. Knight Foundation: G-2019-58809.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request. 
Validation data used in this study are available at the Open Science Framework (OSF) [<xref ref-type="bibr" rid="ref37">37</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: LL, YD, CT, DS, SY</p><p>Data curation: LL, YD, CT</p><p>Formal analysis: LL</p><p>Methodology: LL, YD, CT</p><p>Supervision: DS, SY</p><p>Validation: LL, YD, CT</p><p>Writing-original draft preparation: LL, YD, CT</p><p>Writing-review &#x0026; editing: LL, YD, CT, DS, SY</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">GPU</term><def><p>Graphics Processing Unit</p></def></def-item><def-item><term id="abb4">LLaVA</term><def><p>Large Language and Vision Assistant</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">LoRA</term><def><p>Low-Rank Adaptation</p></def></def-item><def-item><term id="abb7">Q&#x0026;A</term><def><p>question and answer</p></def></def-item><def-item><term id="abb8">USDA</term><def><p>US Department of Agriculture</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>R</given-names> </name><name name-style="western"><surname>Todd</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wardak</surname><given-names>S</given-names> </name><name name-style="western"><surname>Partridge</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Raeside</surname><given-names>R</given-names> </name></person-group><article-title>Feasibility and acceptability of chatbots 
for nutrition and physical activity health promotion among adolescents: systematic scoping review with adolescent consultation</article-title><source>JMIR Hum Factors</source><year>2023</year><month>05</month><day>5</day><volume>10</volume><issue>1</issue><fpage>e43227</fpage><pub-id pub-id-type="doi">10.2196/43227</pub-id><pub-id pub-id-type="medline">37145858</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Achiche</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pomey</surname><given-names>MP</given-names> </name><etal/></person-group><article-title>Adapting and evaluating an AI-based chatbot through patient and stakeholder engagement to provide information for different health conditions: master protocol for an adaptive platform trial (the MARVIN Chatbots Study)</article-title><source>JMIR Res Protoc</source><year>2024</year><month>02</month><day>13</day><volume>13</volume><issue>1</issue><fpage>e54668</fpage><pub-id pub-id-type="doi">10.2196/54668</pub-id><pub-id pub-id-type="medline">38349734</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nadarzynski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Miles</surname><given-names>O</given-names> </name><name name-style="western"><surname>Cowie</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ridge</surname><given-names>D</given-names> </name></person-group><article-title>Acceptability of artificial intelligence (AI)-led chatbot services in healthcare: a mixed-methods study</article-title><source>Digit 
Health</source><year>2019</year><volume>5</volume><fpage>2055207619871808</fpage><pub-id pub-id-type="doi">10.1177/2055207619871808</pub-id><pub-id pub-id-type="medline">31467682</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laranjo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Tong</surname><given-names>HL</given-names> </name><etal/></person-group><article-title>Conversational agents in healthcare: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>09</month><day>1</day><volume>25</volume><issue>9</issue><fpage>1248</fpage><lpage>1258</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy072</pub-id><pub-id pub-id-type="medline">30010941</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Klopfenstein</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Delpriori</surname><given-names>S</given-names> </name><name name-style="western"><surname>Malatini</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bogliolo</surname><given-names>A</given-names> </name></person-group><article-title>The rise of bots: a survey of conversational interfaces, patterns, and paradigms</article-title><year>2017</year><conf-name>Proceedings of the 2017 Conference on Designing Interactive Systems</conf-name><conf-date>Jun 10-14, 2017</conf-date><conf-loc>Edinburgh, United Kingdom</conf-loc><fpage>555</fpage><lpage>565</lpage><pub-id pub-id-type="doi">10.1145/3064663.3064672</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Farhud</surname><given-names>DD</given-names> </name></person-group><article-title>Impact of lifestyle on health</article-title><source>Iran J Public Health</source><year>2015</year><month>11</month><volume>44</volume><issue>11</issue><fpage>1442</fpage><lpage>1444</lpage><pub-id pub-id-type="medline">26744700</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cecchini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sassi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lauer</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YY</given-names> </name><name name-style="western"><surname>Guajardo-Barron</surname><given-names>V</given-names> </name><name name-style="western"><surname>Chisholm</surname><given-names>D</given-names> </name></person-group><article-title>Tackling of unhealthy diets, physical inactivity, and obesity: health effects and cost-effectiveness</article-title><source>Lancet</source><year>2010</year><month>11</month><day>20</day><volume>376</volume><issue>9754</issue><fpage>1775</fpage><lpage>1784</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(10)61514-0</pub-id><pub-id pub-id-type="medline">21074255</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>Nearly half of Americans use digital voice assistants, mostly on their smartphones</article-title><source>Pew Research Center</source><year>2017</year><month>12</month><day>12</day><access-date>2026-04-13</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.pewresearch.org/fact-tank/2017/12/12/nearly-half-of-americans-use-digital-voice-assistants-mostly-on-their-smartphones/">https://www.pewresearch.org/fact-tank/2017/12/12/nearly-half-of-americans-use-digital-voice-assistants-mostly-on-their-smartphones/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Fukuoka</surname><given-names>Y</given-names> </name></person-group><article-title>A systematic review of artificial intelligence chatbots for promoting physical activity, healthy diet, and weight loss</article-title><source>Int J Behav Nutr Phys Act</source><year>2021</year><month>12</month><day>11</day><volume>18</volume><issue>1</issue><fpage>160</fpage><pub-id pub-id-type="doi">10.1186/s12966-021-01224-6</pub-id><pub-id pub-id-type="medline">34895247</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maher</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Curtis</surname><given-names>RG</given-names> </name><name name-style="western"><surname>Short</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>KJ</given-names> </name></person-group><article-title>A physical activity and diet program delivered by artificially intelligent virtual health coach: proof-of-concept study</article-title><source>JMIR Mhealth 
Uhealth</source><year>2020</year><month>07</month><day>10</day><volume>8</volume><issue>7</issue><fpage>e17558</fpage><pub-id pub-id-type="doi">10.2196/17558</pub-id><pub-id pub-id-type="medline">32673246</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Salvador</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hynes</surname><given-names>N</given-names> </name><name name-style="western"><surname>Aytar</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Learning cross-modal embeddings for cooking recipes and food images</article-title><conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jul 21-26, 2017</conf-date><conf-loc>Honolulu, HI</conf-loc><fpage>3020</fpage><lpage>3028</lpage><pub-id pub-id-type="doi">10.1109/CVPR.2017.327</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>C</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YJ</given-names> </name></person-group><article-title>Visual instruction tuning</article-title><year>2024</year><conf-name>Advances in Neural Information Processing Systems (NeurIPS 2023)</conf-name><conf-date>Dec 10-16, 2023</conf-date><conf-loc>New Orleans, Louisiana, USA</conf-loc><fpage>34892</fpage><lpage>34916</lpage><pub-id pub-id-type="doi">10.48550/arXiv.2304.08485</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Nazary</surname><given-names>F</given-names> </name><name name-style="western"><surname>Deldjoo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Di Noia</surname><given-names>T</given-names> </name></person-group><article-title>ChatGPT-healthprompt: harnessing the power of XAI in prompt-based healthcare decision support using ChatGPT</article-title><year>2023</year><conf-name>European Conference on Artificial Intelligence (ECAI 2023)</conf-name><conf-date>Sep 30 to Oct 5, 2023</conf-date><conf-loc>Krak&#x00F3;w, Poland</conf-loc><fpage>382</fpage><lpage>397</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-50396-2_22</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bickmore</surname><given-names>TW</given-names> </name><name name-style="western"><surname>Caruso</surname><given-names>L</given-names> </name><name name-style="western"><surname>Clough-Gorr</surname><given-names>K</given-names> </name><name name-style="western"><surname>Heeren</surname><given-names>T</given-names> </name></person-group><article-title>&#x2018;It&#x2019;s just like you talk to a friend&#x2019; relational agents for older adults</article-title><source>Interact Comput</source><year>2005</year><month>12</month><volume>17</volume><issue>6</issue><fpage>711</fpage><lpage>735</lpage><pub-id pub-id-type="doi">10.1016/j.intcom.2005.09.002</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Fadhil</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gabrielli</surname><given-names>S</given-names> </name></person-group><article-title>Addressing challenges in promoting healthy lifestyles: the AI-chatbot approach</article-title><conf-name>Proceedings of the 11th 
EAI International Conference on Pervasive Computing Technologies for Healthcare</conf-name><conf-date>May 23-26, 2017</conf-date><conf-loc>Barcelona, Spain</conf-loc><fpage>261</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1145/3154862.3154914</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pereira</surname><given-names>J</given-names> </name><name name-style="western"><surname>D&#x00ED;az</surname><given-names>&#x00D3;</given-names> </name></person-group><article-title>Using health chatbots for behavior change: a mapping study</article-title><source>J Med Syst</source><year>2019</year><month>04</month><day>4</day><volume>43</volume><issue>5</issue><fpage>135</fpage><pub-id pub-id-type="doi">10.1007/s10916-019-1237-1</pub-id><pub-id pub-id-type="medline">30949846</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miner</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Laranjo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kocaballi</surname><given-names>AB</given-names> </name></person-group><article-title>Chatbots in the fight against the COVID-19 pandemic</article-title><source>NPJ Digit Med</source><year>2020</year><volume>3</volume><issue>1</issue><fpage>65</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-0280-0</pub-id><pub-id pub-id-type="medline">32377576</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patil</surname><given-names>D</given-names> </name><name name-style="western"><surname>Iyer</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Mehta</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gavand</surname><given-names>D</given-names> </name></person-group><article-title>Dietbot - diet recommending chatbot</article-title><source>Int J Innov Res Technol</source><year>2021</year><access-date>2026-03-10</access-date><volume>7</volume><issue>11</issue><fpage>709</fpage><lpage>712</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://universalcollegeofengineering.edu.in/wp-content/uploads/2024/03/1-201-60-63.pdf">https://universalcollegeofengineering.edu.in/wp-content/uploads/2024/03/1-201-60-63.pdf</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkaissi</surname><given-names>H</given-names> </name><name name-style="western"><surname>McFarlane</surname><given-names>SI</given-names> </name></person-group><article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title><source>Cureus</source><year>2023</year><month>02</month><volume>15</volume><issue>2</issue><fpage>e35179</fpage><pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id><pub-id pub-id-type="medline">36811129</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaishya</surname><given-names>R</given-names> </name><name name-style="western"><surname>Misra</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vaish</surname><given-names>A</given-names> </name></person-group><article-title>ChatGPT: is this version good for healthcare and research?</article-title><source>Diabetes Metab Syndr</source><year>2023</year><month>04</month><volume>17</volume><issue>4</issue><fpage>102744</fpage><pub-id 
pub-id-type="doi">10.1016/j.dsx.2023.102744</pub-id><pub-id pub-id-type="medline">36989584</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rains</surname><given-names>SA</given-names> </name></person-group><article-title>The nature of psychological reactance revisited: a meta-analytic review</article-title><source>Hum Commun Res</source><year>2013</year><month>01</month><volume>39</volume><issue>1</issue><fpage>47</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.1111/j.1468-2958.2012.01443.x</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Lane</surname><given-names>LT</given-names> </name><name name-style="western"><surname>Deatrick</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Young</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Potts</surname><given-names>KA</given-names> </name></person-group><article-title>Psychological reactance and promotional health messages: the effects of controlling language, lexical concreteness, and the restoration of freedom</article-title><source>Human Comm Res</source><year>2007</year><month>04</month><volume>33</volume><issue>2</issue><fpage>219</fpage><lpage>240</lpage><pub-id pub-id-type="doi">10.1111/j.1468-2958.2007.00297.x</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gambino</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name></person-group><article-title>Considering the context to build theory in HCI, HRI, 
and HMC: explicating differences in processes of communication and socialization with social technologies</article-title><source>HMC</source><year>2022</year><volume>4</volume><fpage>111</fpage><lpage>130</lpage><pub-id pub-id-type="doi">10.30658/hmc.4.6</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pentina</surname><given-names>I</given-names> </name></person-group><article-title>Attachment theory as a framework to understand relationships with social chatbots: a case study of replika</article-title><year>2022</year><conf-name>Hawaii International Conference on System Sciences</conf-name><conf-date>Jan 4-7, 2022</conf-date><pub-id pub-id-type="doi">10.24251/HICSS.2022.258</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fox</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gambino</surname><given-names>A</given-names> </name></person-group><article-title>Relationship development with humanoid social robots: applying interpersonal theories to human-robot interaction</article-title><source>Cyberpsychol Behav Soc Netw</source><year>2021</year><month>05</month><volume>24</volume><issue>5</issue><fpage>294</fpage><lpage>299</lpage><pub-id pub-id-type="doi">10.1089/cyber.2020.0181</pub-id><pub-id pub-id-type="medline">33434097</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lombard</surname><given-names>M</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>K</given-names> </name></person-group><article-title>Social responses to media 
technologies in the 21st century: the media are social actors paradigm</article-title><source>Hum Mach Commun</source><year>2021</year><month>01</month><volume>2</volume><fpage>29</fpage><lpage>55</lpage><pub-id pub-id-type="doi">10.30658/hmc.2.2</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aggarwal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tam</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Qiao</surname><given-names>S</given-names> </name></person-group><article-title>Artificial intelligence-based chatbots for promoting health behavioral changes: systematic review</article-title><source>J Med Internet Res</source><year>2023</year><month>02</month><day>24</day><volume>25</volume><fpage>e40789</fpage><pub-id pub-id-type="doi">10.2196/40789</pub-id><pub-id pub-id-type="medline">36826990</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>C</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name></person-group><article-title>Purrfessor chatbot conceptual model (interactive prototype) [webpage]</article-title><source>Purrfessor</source><year>2024</year><access-date>2024-07-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://purrfessorbot.netlify.app/">https://purrfessorbot.netlify.app/</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="web"><article-title>FoodData Central: Foundation Foods</article-title><source>US Department of Agriculture (USDA), Agricultural Research Service</source><year>2024</year><access-date>2026-04-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://fdc.nal.usda.gov/">https://fdc.nal.usda.gov/</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Redmon</surname><given-names>J</given-names> </name><name name-style="western"><surname>Farhadi</surname><given-names>A</given-names> </name></person-group><article-title>YOLOv3: an incremental improvement</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 8, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1804.02767</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Kwon</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Evaluating large vision-language models for visual framing analysis in news imagery: a theory-driven benchmark</article-title><access-date>2026-01-20</access-date><conf-name>the 59th Hawaii International Conference on System Sciences (HICSS 2026)</conf-name><conf-date>Jan 6-9, 2026</conf-date><conf-loc>Maui, HI, USA</conf-loc><fpage>2716</fpage><lpage>2725</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/10125/111721">https://hdl.handle.net/10125/111721</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Custom search JSON 
API</article-title><source>Google</source><year>2023</year><access-date>2026-04-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://developers.google.com/custom-search/v1/overview">https://developers.google.com/custom-search/v1/overview</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Mitchell</surname><given-names>R</given-names> </name></person-group><source>Web Scraping with Python: Collecting More Data from the Modern Web</source><year>2018</year><access-date>2025-11-10</access-date><edition>2</edition><publisher-name>O&#x2019;Reilly Media</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.oreilly.com/library/view/web-scraping-with/9781491985564/">https://www.oreilly.com/library/view/web-scraping-with/9781491985564/</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Child</surname><given-names>R</given-names> </name><name name-style="western"><surname>Luan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Amodei</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Learning transferable visual models from natural language supervision</article-title><year>2021</year><access-date>2026-03-10</access-date><conf-name>Proceedings of the International Conference on Machine Learning</conf-name><conf-date>Jul 18-24, 2021</conf-date><conf-loc>Virtual</conf-loc><fpage>14108</fpage><lpage>14119</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://proceedings.mlr.press/v139/radford21a.html">https://proceedings.mlr.press/v139/radford21a.html</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Iter</surname><given-names>D</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>C</given-names> </name></person-group><article-title>G-eval: NLG evaluation using GPT-4 with better human alignment</article-title><year>2023</year><conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 6-10, 2023</conf-date><conf-loc>Singapore</conf-loc><fpage>2511</fpage><lpage>2522</lpage><pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.153</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Vij</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Nair</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>TE</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bhowmick</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Fine-tuning language models for recipe generation: a comparative analysis and benchmark 
study</article-title><source>arXiv</source><access-date>2025-10-10</access-date><comment>Preprint posted online on  Feb 4, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.02028</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>Purrfessor</article-title><source>Open Science Framework (OSF)</source><access-date>2026-04-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://osf.io/53wdx/?view_only=467de7cff0a7425ba4e70f5da7c844ca">https://osf.io/53wdx/?view_only=467de7cff0a7425ba4e70f5da7c844ca</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Human scoring example.</p><media xlink:href="ai_v5i1e74111_app1.docx" xlink:title="DOCX File, 40 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Human scoring criteria.</p><media xlink:href="ai_v5i1e74111_app2.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>