<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e77988</article-id><article-id pub-id-type="doi">10.2196/77988</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Augmenting Large Language Model With Prompt Engineering and Supervised Fine-Tuning in Non-Small Cell Lung Cancer Tumor-Node-Metastasis Staging: Framework Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Jin</surname><given-names>Ruonan</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Ling</surname><given-names>Chao</given-names></name><degrees>BEng</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hou</surname><given-names>Yixuan</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Sun</surname><given-names>Yuhan</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Ning</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Han</surname><given-names>Jiefei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sheng</surname><given-names>Jin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Qizhao</given-names></name><degrees>BMed</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Yuepeng</given-names></name><degrees>BEng</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zheng</surname><given-names>Shen</given-names></name><degrees>BEng</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ren</surname><given-names>Xingyu</given-names></name><degrees>MEng</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Chiyu</given-names></name><degrees>MEng</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Jue</given-names></name><degrees>MCM</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name 
name-style="western"><surname>Li</surname><given-names>Cheng</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Liangyihui Network Technology Co, Ltd</institution><addr-line>9/F, Tower T2, Jinheshangcheng, 140 Tianlin Road, Xuhui District</addr-line><addr-line>Shanghai</addr-line><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff2"><institution>School of Medicine, Tongji University</institution><addr-line>Shanghai</addr-line><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Thoracic Surgery, Renmin Hospital of Wuhan University</institution><addr-line>Wuhan</addr-line><addr-line>Hubei</addr-line><country>China</country></aff><aff id="aff4"><institution>Department of Neuro-oncology, Neurosurgery Center, Beijing Tiantan Hospital, Capital Medical University</institution><addr-line>Beijing</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff5"><institution>Department of Medical Oncology, Sir Run Run Shaw Hospital, School of Medicine, Zhejiang University</institution><addr-line>Hangzhou</addr-line><addr-line>Zhejiang</addr-line><country>China</country></aff><aff id="aff6"><institution>NoDesk AI Technology Co.,Ltd</institution><addr-line>Hangzhou</addr-line><addr-line>Zhejiang</addr-line><country>China</country></aff><aff id="aff7"><institution>Zhipu AI Technology Co, Ltd</institution><addr-line>Beijing</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff8"><institution>Department of Biomedical Engineering, Faculty of Engineering, National University of Singapore</institution><addr-line>Singapore</addr-line><addr-line>Singapore</addr-line><country>Singapore</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>Pengan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lampridis</surname><given-names>Savvas</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Cheng Li, PhD, Liangyihui Network Technology Co, Ltd, 9/F, Tower T2, Jinheshangcheng, 140 Tianlin Road, Xuhui District, Shanghai, Shanghai, 200233, China, 86 13636599645; <email>nightwhisper147@hotmail.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>15</day><month>4</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e77988</elocation-id><history><date date-type="received"><day>23</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>28</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>29</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Ruonan Jin, Chao Ling, Yixuan Hou, Yuhan Sun, Ning Li, Jiefei Han, Jin Sheng, Qizhao Wang, Yuepeng Liu, Shen Zheng, Xingyu Ren, Chiyu Chen, Jue Wang, Cheng Li. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 15.4.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e77988"/><abstract><sec><title>Background</title><p>Accurate tumor node metastasis (TNM) staging is fundamental for treatment planning and prognosis in non-small cell lung cancer (NSCLC). However, its complexity poses significant challenges. Traditional rule-based natural language processing methods are constrained by their reliance on manually crafted rules and are susceptible to inconsistencies in clinical reporting.</p></sec><sec><title>Objective</title><p>This study aimed to develop and validate a robust, accurate, and operationally efficient artificial intelligence framework for the TNM staging of NSCLC by strategically enhancing a large language model, GLM-4-Air (general language model), through advanced prompt engineering and supervised fine-tuning (SFT).</p></sec><sec sec-type="methods"><title>Methods</title><p>We constructed a curated dataset of 492 deidentified real-world medical imaging reports, with TNM staging annotations rigorously validated by senior physicians according to the AJCC (American Joint Committee on Cancer) 8th edition guidelines. 
The GLM-4-Air model was systematically optimized via a multi-phase process: iterative prompt engineering incorporating chain-of-thought reasoning and domain knowledge injection for all staging tasks, followed by parameter-efficient SFT using low-rank adaptation for the reasoning-intensive primary tumor characteristics (T) and regional lymph node involvement (N) staging tasks. The final hybrid model was evaluated on a completely held-out test set (black-box) and benchmarked against GPT-4o using standard metrics, statistical tests, and a clinical impact analysis of staging errors.</p></sec><sec sec-type="results"><title>Results</title><p>The optimized hybrid GLM-4-Air model demonstrated reliable performance. It achieved higher staging accuracies on the black-box test set: 92% (95% CI 0.850&#x2010;0.959) for T, 86% (95% CI 0.779&#x2010;0.915) for N, 92% (95% CI 0.850&#x2010;0.959) for distant metastasis status (M), and 90% for overall clinical staging; by comparison, GPT-4o attained 87% (95% CI 0.790&#x2010;0.922), 70% (95% CI 0.604&#x2010;0.781), 78% (95% CI 0.689&#x2010;0.850), and 80%, respectively. The model&#x2019;s robustness was further evidenced by its macro-average <italic>F</italic><sub>1</sub>-scores of 0.914 (T), 0.815 (N), and 0.831 (M), consistently surpassing those of GPT-4o (0.836, 0.620, and 0.698). Analysis of confusion matrices confirmed the model&#x2019;s proficiency in identifying critical staging features while effectively minimizing false negatives. Crucially, the clinical impact assessment showed a substantial reduction in severe category I errors, which are defined as misclassifications that could significantly influence subsequent clinical decisions. Our model committed 0 category I errors in M staging and fewer category I errors in T and N staging. 
Furthermore, the framework demonstrated practical deployability, achieving efficient inference on consumer-grade hardware (eg, 4 RTX 4090 GPUs) with latencies suitable and acceptable for clinical workflows.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The proposed hybrid framework, integrating structured prompt engineering and applying SFT to reasoning-heavy tasks (T/N), enables the GLM-4-Air model to serve as a highly accurate, clinically reliable, and cost-efficient solution for automated NSCLC TNM staging. This work demonstrates the efficacy and potential of a domain-optimized smaller model compared with an off-the-shelf generalist model, holding promise for enhancing diagnostic standardization in resource-aware health care environments.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>clinical decision support</kwd><kwd>non-small cell lung cancer</kwd><kwd>TNM staging</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>GLM-32B</kwd><kwd>GPT-4o</kwd><kwd>prompt engineering</kwd><kwd>supervised fine tuning</kwd><kwd>diagnostic standardization</kwd><kwd>grassroots health care improvement</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>On February 2, 2024, the National Cancer Center of China released &#x201C;Cancer incidence and mortality in China, 2022.&#x201D; The latest data indicate that lung cancer remains the leading cause of cancer incidence and mortality in China [<xref ref-type="bibr" rid="ref1">1</xref>]. Lung cancer can be divided into 2 main histopathological types: non-small cell lung cancer (NSCLC) and small cell lung cancer. 
According to statistical data, NSCLC accounts for approximately 85% of all cases [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>The tumor node metastasis (TNM) staging system, developed by the American Joint Committee on Cancer (AJCC), serves as the foundation for NSCLC treatment planning, outcome assessment, and clinical research design [<xref ref-type="bibr" rid="ref3">3</xref>]. It evaluates 3 critical parameters: primary tumor characteristics (T), regional lymph node involvement (N), and distant metastasis status (M) [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Although tumor characteristics are described in medical imaging reports, the explicit TNM stage classification is rarely included [<xref ref-type="bibr" rid="ref5">5</xref>]. Physicians must extract and interpret the relevant information from the reports and apply the complex TNM stage classification criteria to determine the stage of the patients. The intricacy of the criteria makes it challenging even for experienced physicians to memorize all the details, posing an even greater challenge for early-career physicians or those in primary care settings.</p><p>Natural language processing (NLP) approaches reported for TNM stage classification primarily consist of rule-based methods and deep learning methods. Among the reported research, rule-based NLP approaches have achieved a maximum accuracy of approximately 85% for T and N staging [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. For M staging, a peak accuracy of about 93% has been reached by Park et al [<xref ref-type="bibr" rid="ref8">8</xref>] using a deep learning NLP method.</p><p>While progress has been made, NLP approaches extensively rely on intricately crafted rules [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] and annotated datasets of substantial size [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. 
Besides, the overall accuracy is heavily affected by report standardization, terminology consistency, and contextual clarity in clinical documentation [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Recently, large language models (LLMs) have become a hot topic due to their ability to capture complex patterns and structures of language without the above-mentioned limitations of traditional NLP [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>In the study conducted by Nakamura et al [<xref ref-type="bibr" rid="ref16">16</xref>], the performance of GPT-4 in extracting lung cancer staging from CT radiological reports was evaluated. When guided by TNM staging rules, the model demonstrated varying accuracy rates: 52.2% for T staging, 78.9% for N staging, and 86.7% for M staging without extra training. They indicated that most of ChatGPT&#x2019;s errors were caused by challenges with numerical reasoning or insufficiency in anatomical or lexical knowledge.</p><p>Another study by Matsuo et al [<xref ref-type="bibr" rid="ref17">17</xref>] investigated the performance of multilingual LLMs in interpreting TNM staging from both Japanese and English radiological reports. The researchers found that providing comprehensive TNM definitions notably improved the model&#x2019;s accuracy. The highest accuracy was achieved for reports in English: 47% for T staging, 80% for N staging, 94% for M staging, and an overall accuracy of 36%.</p><p>Additionally, based on a fine-tuned method, a study by Fujimoto [<xref ref-type="bibr" rid="ref18">18</xref>] developed an LLM that was fine-tuned on an augmented dataset consisting of 27 lung cancer cases. They noted that fine-tuning can help the model understand the relationships between different aspects of TNM classification, such as how tumor size relates to the T stage and how lymph node involvement affects the N stage. 
They also observed that the T stage contains several types of criteria other than tumor size and is therefore more complex compared with the M stage.</p><p>Previous applications of LLMs in NSCLC TNM staging have demonstrated promising performance in N and M stage classification. However, these methods show limited accuracy in T staging, which requires a more precise assessment of tumor characteristics, including size, invasion depth, and local extension. This underscores the necessity for further research to develop more robust methods for analyzing complex tumor characteristics [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Moreover, the clinical implementation of artificial intelligence (AI) technologies necessitates thorough cost-effectiveness evaluation, particularly as data privacy concerns often mandate local (on-premises) deployment. The substantial operational costs associated with many commercial LLMs present a dual challenge: high application programming interface fees for cloud-based services and, for local deployment, the significant hardware investment required to run large-parameter models, which often exceed the capabilities of consumer-grade equipment. These cost barriers critically impact the feasibility of integrating such technologies into routine clinical practice.</p><p>Given the challenges encountered in prior studies, this research focuses on optimizing GLM-4-Air (general language model), an LLM with 32 billion parameters, for a more robust and cost-effective solution to NSCLC TNM stage classification. The training framework we developed specifically aims to improve TNM staging accuracy and efficiency for NSCLC, and its performance was rigorously evaluated using a held-out internal test set to ensure an unbiased assessment within this study&#x2019;s scope. 
The ultimate goal is to enhance the standardization of NSCLC cancer management in grassroots medical settings in mainland China as well as in other developing countries.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This study used a structured pipeline that systematically progresses from raw medical report processing to final clinical staging output, as illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. This end-to-end framework facilitates reproducible model development through a structured transformation of unstructured reports into staging conclusions that adhere to clinical standards.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The comprehensive workflow from model development to clinical application. GLM: general language model; M: distant metastasis status; N: regional lymph node involvement; OCR: optical character recognition; SFT: supervised fine-tuning; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig01.png"/></fig></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was approved by the Shanghai Ethics Committee for Clinical Research (SECCR2025-260) as a retrospective analysis. The requirement for informed consent was waived due to the retrospective nature of the study and the use of deidentified data. All patient information was anonymized prior to analysis, with all identifiers, medical record number/visit number, hospital name, attending physician&#x2019;s name, examining doctor&#x2019;s name, and other personally identifiable information such as ID number and contact details removed, to ensure privacy and confidentiality. No compensation was provided to participants as this was a retrospective study using existing clinical data. The study did not involve any images that could potentially identify individual participants. 
This research was conducted in accordance with the principles of the Declaration of Helsinki and adhered to all relevant regional and national research ethics guidelines.</p></sec><sec id="s2-3"><title>Dataset</title><sec id="s2-3-1"><title>Data Collection</title><p>The data for this study were collected through our proprietary medical record management platform, which was specifically developed to assist patients in managing their medical records. The study enrolled participants who were initially suspected of lung cancer through imaging or clinical presentation and subsequently received histopathological or cytological confirmation of NSCLC. All included cases had undergone relevant clinical management between January 2018 and May 2025, with available medical records containing extractable key elements essential for TNM staging, such as tumor size, location, nodal status, and metastatic findings.</p><p>The finalized dataset comprised photos of medical imaging reports for 492 cases of NSCLC. Using optical character recognition (OCR) technology provided by Tencent Co Ltd [<xref ref-type="bibr" rid="ref22">22</xref>], we converted the captured images into text format. Following the OCR process, we applied no further processing to enhance the text&#x2019;s quality. Nevertheless, we specifically examined the OCR outputs of both the white-box and black-box test sets and ran an automated post-OCR Chinese character error rate test on 27 reports randomly selected from the two sets.</p></sec><sec id="s2-3-2"><title>Data Annotation</title><p>The annotation process was conducted by 2 expert-level lung cancer physicians, who performed 3 independent rounds of detailed TNM staging annotations in accordance with the 8th edition AJCC TNM classification guidelines. For the definition and interpretation of professional medical terms, we referred to the standards of the Union for International Cancer Control. 
All annotators were blinded to the previously annotated results to ensure objectivity. Discrepancies in the annotations were systematically documented and discussed with additional physicians to reach a consensus. Finally, high-quality gold standard annotations for subsequent model development and evaluation were established.</p><p>In addition, to quantify the degree of interannotator agreement and the reliability of annotations, which serves as a human-performance benchmark for calibrating model performance, we calculated the Cohen kappa coefficient for all 3 stages in both white-box and black-box test sets among the clinical annotators, specifically prior to the adjudication process.</p></sec><sec id="s2-3-3"><title>Dataset Splitting</title><p>The overall procedure of dataset splitting is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. We randomly divided all 492 collected and annotated cases into training (n1=292) and test (n2=200) datasets. For hyperparameter tuning (including learning rate selection) and early stopping, we strictly used a dedicated validation set. This set was randomly held out from the original 292-case training dataset (n1), constituting about 20% (58 cases) of it.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The overall procedure of dataset splitting in this study. NSCLC: non-small cell lung cancer.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig02.png"/></fig><p>The test dataset (n2) was further subdivided into a white-box subset for model development and a black-box subset for final performance evaluation (100 cases each). It is important to clarify that both subsets were derived from the same internal pool of user-uploaded cases within our platform. 
The term &#x201C;black-box&#x201D; specifically refers to this subset&#x2019;s role as a rigorously held-out, internal test set that remained completely inaccessible during all phases of model development and tuning, thereby ensuring an unbiased final assessment. It was not an external validation cohort from distinct institutions or independent collection pipelines. A stratified sampling strategy was used during data partitioning to ensure a balanced representation of all stages in both subsets, thereby supporting a robust and generalizable evaluation of model performance. The white-box subset was kept completely separate and was never used for any training or tuning decisions. Its role was exclusively for intermediate evaluation at the end of the development cycle to provide a single, neutral performance snapshot and preliminary model diagnosis.</p></sec><sec id="s2-3-4"><title>Data Distribution</title><p>The dataset characteristics and stratification are summarized as follows and detailed in the Multimedia Appendices. As detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, the imaging modality distribution (n=492) was 84.76% CT, 13.01% PET-CT, with the remainder comprising other modalities (eg, MRI, Ultrasound). Regarding hospital sources, 78% of 492 cases originated from 51 distinct contributing hospitals, while the source was unidentified for the remaining 22% due to users&#x2019; proactive removal of hospital identifiers for privacy protection during upload. Subsequently, <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> presents the detailed class distribution for the T and N categories of the training set, and T, N, and M categories, along with the overall clinical stages of the test sets according to annotation. 
Both the training and test sets were designed to ensure coverage of all staging subcategories with a relatively balanced distribution.</p></sec></sec><sec id="s2-4"><title>Criteria for TNM and Clinical Staging</title><p>In this study, to enhance the practical application of AJCC 8th Edition guidelines within Chinese clinical contexts, we developed supplementary TNM staging rules through expert clinical consultation. These rules address specific scenarios either not explicitly covered in the official guidelines or requiring disambiguation for Chinese medical terminology. A comprehensive mapping table (<xref ref-type="table" rid="table1">Table 1</xref>) documents each rule alongside its corresponding AJCC guideline reference. The supplementary rules primarily address: (1) Clinical interpretations for anatomically complex scenarios (eg, trans-lobar growth), and (2) Disambiguation of Chinese medical terms with overlapping semantics (eg, distinguishing between local invasion vs metastatic spread).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Mapping of supplementary staging rules to the American Joint Committee on Cancer (AJCC) 8th edition guidelines.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Tumor node metastasis stage</td><td align="left" valign="top">AJCC guidelines</td><td align="left" valign="top">Supplementary rules</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">T<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (primary tumor)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td 
align="left" valign="top">No primary tumor</td><td align="left" valign="top">1. Maximum Tumor Diameter: In imaging reports, multiple tumor measurements may be present due to factors such as prior surgeries or treatments. The maximum tumor diameter is explicitly defined as the most recent measurement obtained following the current treatment cycle. Care should be taken to avoid misinterpreting size information from nonneoplastic nodules, such as &#x201C;solid nodules.&#x201D; 2. Only explicit expressions such as &#x201C;vertebral metastasis,&#x201D; &#x201C;invasion,&#x201D; &#x201C;encroachment,&#x201D; or &#x201C;occupation&#x201D; in the report should be considered indicators of tumor invasion. Other expressions should not be used as criteria for invasion. (Definition of &#x201C;Tumor Invasion&#x201D;) 3. A nodule should be considered a cancer nodule only if the report explicitly describes it as &#x201C;malignant soft tissue&#x201D; or an equivalent severe condition. For example, &#x201C;a malignant soft tissue mass in the right middle lobe, with multiple nodules in both lungs, suggestive of metastasis.&#x201D; Other less severe descriptions should not be used as criteria for the presence of a cancer nodule (Definition of &#x201C;Cancer Nodule&#x201D;).</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tis</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tis</td><td align="left" valign="top">Carcinoma in situ (squamous or adenocarcinoma)</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1</td><td align="left" valign="top"/><td align="left" 
valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1</td><td align="left" valign="top">Tumor &#x2264;3 cm</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a(mi)</td><td align="left" valign="top">Minimally invasive adenocarcinoma</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a</td><td align="left" valign="top">Superficial spreading tumor in central airways<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a</td><td align="left" valign="top">Tumor &#x2264;1 cm</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1b</td><td align="left" valign="top">Tumor &#x003E;1 but &#x2264;2 cm</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1c</td><td align="left" valign="top">Tumor &#x003E;2 but &#x2264;3 cm</td><td align="left" 
valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td><td align="left" valign="top">Tumor &#x003E;3 but &#x2264;5 cm or tumor involving: visceral pleura, main bronchus (not carina), atelectasis to hilum<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">Tumor invades the left or right main bronchus or visceral pleura.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2a</td><td align="left" valign="top">Tumor &#x003E;3 but &#x2264;4 cm</td><td align="left" valign="top">Note: This supplementary rule conveys the same definition as the AJCC guidelines, with the only modification being the explicit inclusion of &#x201C;left&#x201D; and &#x201C;right&#x201D; specifications, which are not explicitly specified in the original AJCC guidelines.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2b</td><td align="left" valign="top">Tumor &#x003E;4 but &#x2264;5 cm</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td><td align="left" valign="top">Tumor &#x003E;5 but &#x2264;7 cm or invading chest wall, pericardium, phrenic nerve; or separate tumor nodule(s) in the same lobe</td><td align="left" valign="top">1. 
Tumor size &#x003C;5 cm with trans-lobar growth. 2. Tumor invades the parietal pleura (usually noted in pathological staining), chest wall (including Pancoast tumors). Note: Based on the AJCC guidelines, this supplementary rule adds only the scenario involving invasion of the parietal pleura, and the description of chest wall invasion has been expanded to include cases of Pancoast tumors. 3. Tumor invades the main stems of more distal arteries or veins. Note: Based on the AJCC guidelines, this supplementary criterion adds only the scenario involving invasion of the main stems of more distal arteries or veins.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td><td align="left" valign="top">Tumor &#x003E;7 cm or tumor invading: mediastinum, diaphragm, heart, great vessels, recurrent laryngeal nerve, carina, trachea, esophagus, spine; or tumor nodule(s) in a different ipsilateral lobe</td><td align="left" valign="top">1. Tumor size 5&#x2010;7 cm with trans-lobar growth. 2. Tumor invades major vessels (aorta, superior vena cava, inferior vena cava, main pulmonary arteries, and left or right pulmonary veins within the pericardium), vertebral bodies. Note: Based on the AJCC guidelines, this supplementary rule adds the definition of major blood vessels provided in parentheses and includes the scenario involving invasion of the vertebral bodies.</td></tr><tr><td align="left" valign="top" colspan="3">N<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> (regional lymph nodes)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td><td align="left" valign="top">No regional node metastasis</td><td align="left" valign="top">1. 
Expressions such as &#x201C;small nodules&#x201D; or &#x201C;mild fluorodeoxyglucose (FDG) uptake&#x201D; should be interpreted with caution and should not be directly inferred as lymph node metastasis. Only explicit statements such as &#x201C;lymph node metastasis,&#x201D; &#x201C;enlarged lymph nodes,&#x201D; or &#x201C;markedly abnormal lymph nodes&#x201D; should be considered evidence of lymph node metastasis. (Definition of &#x201C;Lymph Node Metastasis&#x201D;) 2. PET-CT<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> reports were prioritized for evaluation. If unavailable, contrast-enhanced CT reports were used as the secondary reference. In the absence of both PET-CT and contrast-enhanced CT reports, lymph node status was directly classified as undetermined.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td><td align="left" valign="top">Metastasis in ipsilateral pulmonary or hilar nodes</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td><td align="left" valign="top">Metastasis in ipsilateral mediastinal or subcarinal nodes</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N3</td><td align="left" valign="top">Metastasis in contralateral mediastinal, hilar, or supraclavicular nodes</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top" colspan="3">M<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup> (distant metastasis)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M0</td><td align="left" valign="top">No distant metastasis</td><td align="left" valign="top">1. 
The requirement for cautious interpretation of terms such as &#x201C;&#x5C0F;&#x7ED3;&#x8282;&#x201D; (small nodule), &#x201C;&#x4F4E;&#x5BC6;&#x5EA6;&#x7ED3;&#x8282;&#x5F71;&#x201D; (low-density nodular shadow), &#x201C;&#x81F4;&#x5BC6;&#x5F71;&#x201D; (dense shadow), &#x201C;&#x9AD8;&#x5BC6;&#x5EA6;&#x5F71;&#x201D; (high-density shadow), &#x201C;&#x5F3A;&#x56DE;&#x58F0;&#x4F34;&#x58F0;&#x5F71;&#x201D; (strong echo with acoustic shadow), &#x201C;&#x8F6F;&#x5316;&#x7076;&#x201D; (softened focus), and &#x201C;&#x6761;&#x72B6;&#x5F3A;&#x5316;&#x5F71;&#x201D; (linear enhancement), preventing their direct classification as metastatic lesions. 2. To mainly consider the expressions of &#x201C;&#x8F6C;&#x79FB;&#x6027;&#x201D; (metastatic), &#x201C;&#x8003;&#x8651;&#x8F6C;&#x79FB;&#x201D; (consider metastasis), &#x201C;&#x7591;&#x4F3C;&#x8F6C;&#x79FB;&#x201D; (suspected metastasis), &#x201C;&#x5BC6;&#x5EA6;&#x589E;&#x9AD8;&#x5F71;&#x201D; (increased density shadow), or &#x201C;&#x9AD8;&#x5BC6;&#x5EA6;&#x7ED3;&#x8282;&#x201D; (high-density nodule) in other distant organs, as definitive evidence for metastasis.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1a</td><td align="left" valign="top">Malignant pleural or pericardial effusions or pleural or pericardial nodules or separate tumor nodule(s) in a contralateral lobe</td><td align="left" valign="top">Metastasis is classified as M1a if it is confined to the thoracic cavity and the report explicitly describes tumor involvement of the pleura or pericardium with metastatic nodules.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1b</td><td align="left" valign="top">Single extrathoracic metastasis</td><td align="left" valign="top">NULL</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1c</td><td align="left" valign="top">Multiple extrathoracic metastases (1 or &#x003E;1 organ)</td><td align="left" valign="top">NULL</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>T: primary tumor characteristics.</p></fn><fn id="table1fn2"><p><sup>b</sup>Superficial spreading tumor of any size but confined to the tracheal or bronchial wall.</p></fn><fn id="table1fn3"><p><sup>c</sup>Atelectasis or obstructive pneumonitis extending to hilum; such tumors are classified as T2a if &#x003E;3 and &#x2264;4 cm, T2b if &#x003E;4 and &#x2264;5 cm. Pleural effusions that are cytologically negative, nonbloody, transudative, and clinically judged not to be due to cancer are excluded.</p></fn><fn id="table1fn4"><p><sup>d</sup>N: regional lymph node involvement.</p></fn><fn id="table1fn5"><p><sup>e</sup>PET-CT: positron emission tomography&#x2013;computed tomography.</p></fn><fn id="table1fn6"><p><sup>f</sup>M: distant metastasis status.</p></fn></table-wrap-foot></table-wrap><p>Note that all supplementary logic maintains consistency with AJCC principles while improving alignment with Chinese clinical documentation practices. These enhancements ensure more accurate staging outcomes without contradicting established guideline definitions.</p><p>In accordance with our supplementary staging rules, we adopted a conservative interpretation for M staging: lesions are classified as metastatic only when the report contains terminology with high specificity for metastasis (eg, &#x201C;&#x8F6C;&#x79FB;&#x201D; and &#x201C;&#x8003;&#x8651;&#x8F6C;&#x79FB;&#x201D;). 
Descriptions such as &#x201C;&#x5C0F;&#x7ED3;&#x8282;&#x201D; (small nodule) or &#x201C;&#x9AD8;&#x5BC6;&#x5EA6;&#x5F71;&#x201D; (high-density shadow), among others, which are common but nonspecific in Chinese radiology reports, are not considered sufficient evidence on their own, though they may contribute to a metastatic inference when supported by additional contextual findings. This strategy is designed to minimize false positives. While this approach could theoretically increase false negatives, we conducted a comparative analysis using a more lenient interpretation strategy. This alternative approach removed key restrictive rules, including:</p><list list-type="order"><list-item><p>The requirement for cautious interpretation of terms such as &#x201C;&#x5C0F;&#x7ED3;&#x8282;&#x201D; (small nodule), &#x201C;&#x4F4E;&#x5BC6;&#x5EA6;&#x7ED3;&#x8282;&#x5F71;&#x201D; (low-density nodular shadow), &#x201C;&#x81F4;&#x5BC6;&#x5F71;&#x201D; (dense shadow), &#x201C;&#x9AD8;&#x5BC6;&#x5EA6;&#x5F71;&#x201D; (high-density shadow), &#x201C;&#x5F3A;&#x56DE;&#x58F0;&#x4F34;&#x58F0;&#x5F71;&#x201D; (strong echo with acoustic shadow), &#x201C;&#x8F6F;&#x5316;&#x7076;&#x201D; (softened focus), and &#x201C;&#x6761;&#x72B6;&#x5F3A;&#x5316;&#x5F71;&#x201D; (linear enhancement), preventing their direct classification as metastatic lesions.</p></list-item><list-item><p>To mainly consider the expressions of &#x201C;&#x8F6C;&#x79FB;&#x6027;&#x201D; (metastatic), &#x201C;&#x8003;&#x8651;&#x8F6C;&#x79FB;&#x201D; (consider metastasis), &#x201C;&#x7591;&#x4F3C;&#x8F6C;&#x79FB;&#x201D; (suspected metastasis), &#x201C;&#x5BC6;&#x5EA6;&#x589E;&#x9AD8;&#x5F71;&#x201D; (increased density shadow), or &#x201C;&#x9AD8;&#x5BC6;&#x5EA6;&#x7ED3;&#x8282;&#x201D; (high-density nodule) in other distant organs, as definitive evidence for metastasis.</p></list-item></list><p>The results of the comparison of 2 strategies on M staging are shown and discussed in the Clinical Impact Assessment subsection of 
the Model Evaluation section in Results and Discussion, respectively.</p><p>Finally, the overall clinical staging (Stage IA/IB, IIA/IIB, IIIA/IIIB/IIIC, or IVA/IVB) was algorithmically derived from the predicted T, N, and M staging results. This derivation strictly adhered to the stage grouping rules defined in the AJCC 8th Edition Cancer Staging Manual. The definitive mapping table from TNM combinations to clinical stages, as provided by the AJCC guidelines, is illustrated in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>American Joint Committee on Cancer tumor node metastasis staging guidelines: From T, N, and M descriptors to stage grouping (adapted from Goldstraw et al [<xref ref-type="bibr" rid="ref23">23</xref>], which is published under Creative Commons Attribution 4.0 International License [<xref ref-type="bibr" rid="ref24">24</xref>]). M: distant metastasis status; N: regional lymph node involvement; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig03.png"/></fig></sec><sec id="s2-5"><title>Large Language Model</title><p>The GLM series, developed by Tsinghua University and Zhipu AI, has been pretrained on approximately 10 trillion tokens of multilingual data, predominantly in Chinese and English [<xref ref-type="bibr" rid="ref25">25</xref>]. GLM-4-Air, an optimized variant of GLM-4 within the ChatGLM family, was selected for this study. 
This 32-billion-parameter model preserves the robust performance of its predecessor while offering reduced processing latency and lower inference costs, making it a more viable candidate for local deployment scenarios where data privacy is paramount, and the prohibitive hardware requirements of much larger models are a practical constraint.</p></sec><sec id="s2-6"><title>Prompt Optimization</title><sec id="s2-6-1"><title>Overview</title><p>To improve the performance of LLM in NSCLC TNM staging, we developed a series of prompts through iterative optimization. These prompts enhanced the model&#x2019;s ability to accurately extract and interpret key patient medical information and match TNM judgment criteria.</p><p>The structure of our prompt implemented a standardized 3-step process: Information Extraction, Standard Matching, and Structured Output. This approach ensured consistent processing across all cases while enhancing the model&#x2019;s reasoning capabilities. Notably, our output design incorporated the complete reasoning (chain-of-thought [CoT]) process rather than mere conclusions. This approach improved both reliability and interpretability of staging assessments, providing key information for result verification and analysis.</p><p>The entire optimization process was clearly divided into 4 phases, with each phase focusing on specific objectives and targeted improvements. Our strategy ensured systematic improvements at each stage, resulting in more precise and clinically applicable model performance.</p></sec><sec id="s2-6-2"><title>Phase 1: Establishing a Baseline With Fundamental TNM Criteria</title><p>In the initial phase, a baseline prompt incorporating fundamental TNM staging criteria was developed using CoT reasoning. Through this structured approach, we established a robust foundation for the model to understand and process TNM staging tasks systematically. 
In this phase, we integrated the TNM staging criteria (<xref ref-type="other" rid="box1">Textbox 1</xref>) into the prompt, elaborating on the specific judgment logic as follows:</p><boxed-text id="box1"><title> Tumor node metastasis (TNM) judgment criteria.</title><p>Primary tumor characteristics (T) staging</p><p>&#x2003;T4: Classified as T4 if the maximum tumor diameter exceeds 7 cm, if it invades specific anatomical structures, or if there are cancer nodules in different lobes of the same lung.</p><p>&#x2003;T3: Classified as T3 if the maximum tumor diameter is greater than 5 cm and less than or equal to 7 cm, if it invades other specified structures, or if there are isolated cancer nodules within the same lobe.</p><p>&#x2003;T2: Classified as T2 if the maximum tumor diameter is greater than 3 cm and less than or equal to 5 cm, if it invades the main bronchus or visceral pleura, or if there is tumor-induced atelectasis or obstructive pneumonia.</p><p>&#x2003;T1:</p><p>&#x2003;&#x2003;T1a: Classified as T1a if the maximum tumor diameter is less than or equal to 1 cm.</p><p>&#x2003;&#x2003;T1b: Classified as T1b if the maximum tumor diameter is greater than 1 cm and less than or equal to 2 cm.&#x2003;&#x2003;T1c: Classified as T1c if the maximum tumor diameter is greater than 2 cm and less than or equal to 3 cm.</p><p>&#x2003;&#x2003;Tx: Classified as Tx if the primary tumor cannot be evaluated.</p><p>Regional lymph node involvement (N) staging</p><p>&#x2003;N3: Classified as N3 if there is metastasis to contralateral mediastinal, contralateral hilar, ipsilateral or contralateral scalene, or supraclavicular lymph nodes.</p><p>&#x2003;N2: Classified as N2 if there is metastasis to ipsilateral mediastinal lymph nodes or subcarinal lymph nodes.</p><p>&#x2003;N1: Classified as N1 if there is metastasis to ipsilateral peribronchial lymph nodes and/or hilar lymph nodes, or if there is metastasis to ipsilateral intrapulmonary lymph nodes.</p><p>&#x2003;N0: Classified as 
N0 if none of the above conditions are met.</p><p>&#x2003;Nx: Classified as Nx if regional lymph node metastasis cannot be determined.</p><p>Distant metastasis status (M) staging</p><p>&#x2003;M1c: Classified as M1c if there are multiple metastases outside the thorax.</p><p>&#x2003;M1b: Classified as M1b if there is a single metastasis outside the thorax.</p><p>&#x2003;M1a: Classified as M1a if there is intrathoracic metastasis, metastasis to the contralateral lung, tumor accompanied by metastatic nodules in the pleura or pericardium, or malignant pleural effusion or pericardial effusion.</p><p>&#x2003;M0: Classified as M0 if there is no distant metastasis.</p><p>&#x2003;Mx: Classified as Mx if distant metastasis cannot be determined.</p></boxed-text></sec><sec id="s2-6-3"><title>Phase 2: Information Extraction From Medical Reports</title><p>This stage aimed to accurately extract lung cancer TNM staging information from medical reports to assist subsequent assessments. Through white-box testing, we found that the model frequently omitted key staging-related information from medical reports. To address this issue, we refined our prompt by explicitly listing all required information for extraction (<xref ref-type="other" rid="box2">Textbox 2</xref>). 
This enhancement improved the model&#x2019;s ability to identify and prioritize relevant details from diverse medical image reports.</p><p>Additionally, in this version of the prompt, we instructed the model to focus on key sections, such as the &#x201C;diagnostic opinion&#x201D; and &#x201C;conclusion&#x201D;, to concentrate the model&#x2019;s attention on essential information.</p><boxed-text id="box2"><title> Key information extraction.</title><p>Information requirements:</p><p>Extract the following information from the imaging report along with the rationale for the judgment:</p><p>Tumor size and location:</p><p>&#x2003;What is the maximum diameter of the tumor in cm?</p><p>&#x2003;Does the tumor grow across lobes?</p><p>&#x2003;In which lung lobe is the tumor located?</p><p>Cancer nodule situation:</p><p>&#x2003;What is the location and size of the cancer nodule (in cm)?</p><p>&#x2003;Is there a cancer nodule present in different lobes on the same side? Provide the rationale and reasoning for the judgment.</p><p>&#x2003;Is there a solitary cancer nodule present in the same lung lobe? Provide the rationale and reasoning for the judgment.</p></boxed-text></sec><sec id="s2-6-4"><title>Phase 3: Independently Handling T, N, and M Staging</title><p>In this critical phase, we designed prompts to enable the LLM to independently handle T, N, and M staging. We observed that the T, N, and M stages in the NSCLC TNM staging system corresponded to different judgment logic and medical knowledge. We also found that combining all staging criteria in a single prompt led to misinterpretation of different staging standards.</p><p>Therefore, we separated T, N, and M staging and developed specialized prompts for each category. This separation allowed the model to focus precisely on the unique criteria and nuances of each staging category, enhancing its accuracy and reliability.</p><p>Given the complexity of T staging, we particularly focused on refining its criteria. 
We broke down the matching conditions into prioritized steps, providing systematic guidance for T staging judgments (<xref ref-type="other" rid="box3">Textbox 3</xref>).</p><boxed-text id="box3"><title> Guidance for primary tumor characteristics (T) staging judgments.</title><p># &#x201C;T Staging Criteria and Workflow&#x201D;:</p><p>Step 1: Extract relevant information from the imaging report according to primary tumor assessment criteria.</p><p>Step 2: Assess for T4 stage. If any of the following conditions are met, classify as T4. If not, proceed to Step 3.</p><p>Condition 1: Tumor size 50&#x2010;70 mm with trans-lobar growth.</p><p>Condition 2: Invasion of any T4 anatomical structures.</p><p>Condition 3: Presence of cancer nodules in different ipsilateral lobes.</p><p>Step 3: Assess for T3 stage. If any of the following conditions are met, classify as T3. If not, proceed to Step 4.</p><p>Condition 1: Invasion of any T3 anatomical structures.</p><p>Condition 2: Presence of isolated cancer nodule in the same lobe.</p><p>Condition 3: Tumor size &#x003C;50 mm with trans-lobar growth.</p><p>Step 4: Assess for T2 stage. If the following condition is met, classify as T2. 
If not, proceed to Step 5.</p><p>Condition 1: Presence of atelectasis or obstructive pneumonia extending to hilar region.</p><p>Step 5: If none of the above conditions are met but tumor size is available, classify based on the following criteria:</p><list list-type="bullet"> <list-item><p>&#x2264;10 mm: T1a</p></list-item> <list-item><p>&#x003E;10 mm to &#x2264;20 mm: T1b</p></list-item> <list-item><p>&#x003E;20 mm to &#x2264;30 mm: T1c</p></list-item> <list-item><p>&#x003E;30 mm to &#x2264;40 mm: T2a</p></list-item> <list-item><p>&#x003E;40 mm to &#x2264;50 mm: T2b</p></list-item> <list-item><p>&#x003E;50 mm to &#x2264;70 mm: T3</p></list-item> <list-item><p>&#x003E;70 mm: T4</p></list-item> </list><p>Step 6: If no primary tumor is mentioned, classify as T0.</p></boxed-text></sec><sec id="s2-6-5"><title>Phase 4: Optimizing for Detailed Clinical Standards</title><p>In the final phase, we optimized the prompts to align with more comprehensive clinical standards as mentioned in the Criteria for TNM and Clinical Staging subsection. We analyzed and categorized the causes of model misjudgments and compiled them as background knowledge. This knowledge was then injected into the model before judgment to decrease the error rate. For instance, we emphasized the detailed criteria of &#x201C;Tumor Invasion,&#x201D; &#x201C;Cancer Nodule,&#x201D; and &#x201C;Lymph Node Metastasis&#x201D; (supplementary rules of T0 in <xref ref-type="table" rid="table1">Table 1</xref>). These enhancements improved the model&#x2019;s capacity to process complex medical narratives.</p><p>We also introduced supplementary judgment rules, such as clearly defining the invasion sites for different stages (supplementary rules of T2, T3, and T4 in <xref ref-type="table" rid="table1">Table 1</xref>), and formulated the corresponding prompt as follows (<xref ref-type="other" rid="box4">Textbox 4</xref>). 
These supplements helped reduce ambiguity and enhance staging consistency and accuracy.</p><boxed-text id="box4"><title> Examples of supplementary judgment rules.</title><p>T4 staging invasion sites:</p><p>Does the report clearly indicate that the tumor invades any of the following structures: diaphragm, mediastinum, heart, major vessels (aorta, superior vena cava, inferior vena cava, main pulmonary arteries, and left or right pulmonary veins within the pericardium), trachea, recurrent laryngeal nerve, esophagus, vertebral bodies, or carina? Provide the specific evidence and rationale for the judgment.</p><p>T3 staging invasion sites:</p><p>Does the report clearly indicate that the tumor invades the parietal pleura (usually noted in pathological staining), chest wall (including Pancoast tumors), phrenic nerve, or pericardium? Provide the specific evidence and rationale for the judgment.</p><p>Does the report clearly indicate that the tumor invades the main stems of more distal arteries or veins? Provide the specific evidence and rationale for the judgment.</p><p>T2 staging invasion sites:</p><p>Does the report clearly indicate that the tumor invades the left or right main bronchus or visceral pleura? Provide specific evidence and rationale for the judgment.</p></boxed-text><p>Through comprehensive and iterative white-box testing, the deficiencies of the LLM in NSCLC TNM staging were identified. During this process, we optimized the prompts to address these deficiencies, thereby guiding the staging process of the LLM to enhance its performance. This improvement addressed problems including incomplete information extraction, inadequate handling of TNM staging complexities, and limited comprehension of medical terminology.</p><p>Despite these improvements, the model still faced challenges in numerical calculations and prompt adherence. These issues affected the accuracy of staging results, particularly in complex T staging. 
These limitations were addressed in the subsequent supervised fine-tuning (SFT) phase.</p></sec></sec><sec id="s2-7"><title>SFT</title><p>The SFT process adjusts the model parameters using supervised learning techniques. This allows the model to adapt to specific data distributions and task requirements, leading to improved performance. In this study, we used low-rank adaptation (LoRA) technology for SFT on the GLM-4-Air model. LoRA is a parameter-efficient fine-tuning technique. It reduces computational complexity and memory requirements by decomposing the weight update matrix while maintaining fine-tuning effectiveness [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>Based on our annotated training dataset, we reprocessed the stage labels. To be specific, the stage label was reprocessed to the reasoning process consistent with the optimized prompt instructions by professional physicians (<xref ref-type="other" rid="box5">Textbox 5</xref>). This made the training data more comprehensible to the GLM-4-Air model, providing a solid foundation for model training.</p><boxed-text id="box5"><title> Example of annotated training data for primary tumor characteristics (T) staging model.</title><p>Original data:</p><p>Content: Medical Imaging Report Content</p><p>Label: Stage T2a</p><p>Re-processed data for supervised fine-tuning (SFT):</p><p>{</p><p>&#x2003;&#x201C;messages&#x201D;: [</p><p>&#x2003;&#x2003;&#x2003;&#x2003;{</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;role&#x201D;: &#x201C;system&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;content&#x201D;: &#x201C;You are a senior chief physician specializing in non-small cell lung cancer diagnosis.&#x201D;</p><p>&#x2003;&#x2003;&#x2003;&#x2003;},</p><p>&#x2003;&#x2003;&#x2003;&#x2003;{</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;role&#x201D;: &#x201C;user&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;content&#x201D;: &#x201C;Prompt and Medical Imaging Report 
Content.&#x201D;</p><p>&#x2003;&#x2003;&#x2003;&#x2003;},</p><p>&#x2003;&#x2003;&#x2003;&#x2003;{</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;role&#x201D;: &#x201C;assistant&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;content&#x201D;: {</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;report_extraction_results&#x201D;: {</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;tumor_size_and_location&#x201D;: {</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;maximum_tumor_diameter(mm)&#x201D;: &#x201C;39&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;tumor_cross_lobe_growth&#x201D;: &#x201C;No&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;tumor_lobe_location&#x201D;: &#x201C;Left lower lobe&#x201D;</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; },</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;cancer_nodule_status&#x201D;: {</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;nodule_location_and_size(mm)&#x201D;: &#x201C;None&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;ipsilateral_different_lobe_nodules&#x201D;: &#x201C;No&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;isolated_nodule_same_lobe&#x201D;: &#x201C;No&#x201D;</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; },</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;T4_stage_invasion_sites&#x201D;: &#x201C;None&#x201D;,</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;T3_stage_invasion_sites&#x201D;: &#x201C;None&#x201D;</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; },</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; &#x201C;T_staging&#x201D;: {</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x2003; 
&#x201C;reasoning_process&#x201D;: "Based on the imaging report, the tumor&#x2019;s maximum diameter is 3.9 cm, located in the left lower lobe. No invasion of T4 stage structures is mentioned, and no cancer nodules are found in ipsilateral different lobes, which does not meet T4 criteria. The tumor diameter is less than 5 cm, with no invasion of T3 stage structures mentioned and no isolated nodules in the same lobe, which does not meet T3 criteria. The tumor diameter is greater than 3 cm but less than or equal to 4 cm, with no mention of the tumor causing atelectasis or obstructive pneumonia extending to the hilar region; therefore, it is staged as T2a.",</p><p>&#x2003;&#x2003;&#x2003;&#x2003;&#x2003;&#x201C;reasoning_conclusion&#x201D;: &#x201C;Stage T2a&#x201D;</p><p>&#x2003;&#x2003;&#x2003;&#x2003; }</p><p>&#x2003;&#x2003;&#x2003; }</p><p>&#x2003;&#x2003; }</p><p>&#x2003;]</p><p>}</p></boxed-text><p>To deal with the potential risks of overfitting and catastrophic forgetting associated with the limited sample size, we introduced a key design in the SFT stage for the core task. Specifically, we constructed a heterogeneous instruction-tuning dataset by blending the 292 target-domain cases with approximately 2000 general-purpose instruction samples covering diverse tasks such as mathematics, summarization, and translation. The details of the categories, exact counts, proportions, and specific examples for each category of the instruction samples are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. Based on our empirical assessment, these components were mixed at a ratio of approximately 1:7. 
This strategy was designed to leverage a multi-task learning framework, compelling the model to adapt to the specific medical task while simultaneously preserving and reinforcing its general instruction-following and reasoning capabilities, thereby enhancing its generalization stability.</p><p>In the fine-tuning phase, we used the LoRA method for SFT with the following parameters (<xref ref-type="table" rid="table2">Table 2</xref>), and the GPU we used in this task was NVIDIA A800.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Supervised fine-tuning parameter settings.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top">Rank</td><td align="left" valign="top">16</td></tr><tr><td align="left" valign="top">LoRA<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> dropout</td><td align="left" valign="top">0.05</td></tr><tr><td align="left" valign="top">Learning rate</td><td align="left" valign="top">&#x2248;2.5&#x00D7;10<sup>&#x2013;4</sup>-5&#x00D7;10<sup>&#x2013;4</sup></td></tr><tr><td align="left" valign="top">Batch size</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Precision</td><td align="left" valign="top">FP16</td></tr><tr><td align="left" valign="top">Scheduler</td><td align="left" valign="top">Linear Decay</td></tr><tr><td align="left" valign="top">Random seeds</td><td align="left" valign="top">1102</td></tr><tr><td align="left" valign="top">Number of trainable parameters</td><td align="left" valign="top">about 4M</td></tr><tr><td align="left" valign="top">Stopping criteria</td><td align="left" valign="top">3 epochs</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>LoRA: low-rank adaptation.</p></fn></table-wrap-foot></table-wrap><p>These parameters were carefully tuned to ensure efficient training and 
stable performance of the model.</p><p>During the SFT process, we used white-box testing to identify and analyze deficiencies in the model&#x2019;s medical knowledge and capabilities. Based on these findings, we conducted targeted parameter optimization and fine-tuning to enhance its TNM staging performance. The results showed that this refined tuning strategy, based on continuous feedback and optimization, improved the model&#x2019;s accuracy and stability in NSCLC TNM staging.</p></sec><sec id="s2-8"><title>Model Evaluation</title><sec id="s2-8-1"><title>Overview</title><p>Both models were evaluated on the held-out test sets (comprising both the White-Box and Black-Box sets). To ensure a comprehensive, unbiased, and statistically robust comparison of the performance between the GLM-4-Air and GPT-4o models, we used a standardized framework for evaluation. This framework encompasses performance metrics, statistical comparison methods, clinical impact assessment, and cost-effective evaluation, as detailed below.</p><p>It is important to clarify that all the evaluations were performed under the same prompts, test datasets, and evaluation environments to ensure fairness and validity. 
As a concrete specification of these standardized conditions, the key inference configuration parameters, including temperature, top_p, max_tokens, and max_attempts, used for both GLM-4-Air and GPT-4o are detailed in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Inference configuration table for both models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">Value</td></tr></thead><tbody><tr><td align="left" valign="top">temperature</td><td align="left" valign="top">0.01</td></tr><tr><td align="left" valign="top">top_p</td><td align="left" valign="top">0.01</td></tr><tr><td align="left" valign="top">max_tokens</td><td align="left" valign="top">8192</td></tr><tr><td align="left" valign="top">max_attempts</td><td align="left" valign="top">3</td></tr></tbody></table></table-wrap></sec><sec id="s2-8-2"><title>Performance Metrics</title><sec id="s2-8-2-1"><title>Accuracy</title><p>Accuracy is defined as the overall proportion of a model&#x2019;s predictions, for each individual T, N, and M stage and for the overall clinical stage, that agree with the clinical annotations. The mapping from TNM staging to the overall clinical stage was strictly determined according to the AJCC 8th edition criteria (<xref ref-type="fig" rid="figure3">Figure 3</xref>). Note that a theoretically &#x201C;correct&#x201D; clinical stage group could be generated from an incorrect combination of individual T, N, or M predictions. 
To ensure transparency and to help validate clinical safety beyond simple accuracy percentages, we also calculate and report the &#x201C;Exact Match Ratio,&#x201D; which represents the percentage of cases where T, N, and M are all correctly predicted for the same patient.</p></sec><sec id="s2-8-2-2"><title>Confidence Intervals</title><p>CIs were used to quantify the uncertainty of the evaluation results, reflecting the precision of the performance metric estimates. The 95% CIs calculated in this study indicate that we can be 95% confident that the model&#x2019;s true performance on a larger, similarly distributed test set would fall within this range.</p></sec><sec id="s2-8-2-3"><title>Confusion Matrices</title><p>The confusion matrices of both models in both white-box and black-box were calculated to show the specific misclassifications of TNM stagings.</p></sec><sec id="s2-8-2-4"><title>Precision</title><p>Precision is defined as the proportion of true positive cases among all samples predicted as positive by the model. This metric evaluates the accuracy of the model&#x2019;s positive predictions. For instance, in M1 staging, precision represents the proportion of cases correctly classified as M1 among all cases predicted by the model as M1.</p></sec><sec id="s2-8-2-5"><title>Recall</title><p>Recall is defined as the proportion of true positive cases correctly identified by the model among all actual positive samples. This metric evaluates the completeness of the model in identifying relevant instances. 
For example, in M1 staging, recall represents the proportion of true M1 cases that are correctly identified by the model.</p></sec><sec id="s2-8-2-6"><title><italic>F</italic><sub>1</sub>-score</title><p>The <italic>F</italic><sub>1</sub>-score represents the harmonic mean of precision and recall, providing a single composite metric that balances both the model&#x2019;s accuracy in positive predictions (precision) and its completeness in identifying relevant instances (recall). For the multi-class classification tasks (T, N, and M staging), we also report the macro-average <italic>F</italic><sub>1</sub>-score, which treats all classes equally and offers a perspective unbiased by class distribution. The formula for calculating the <italic>F</italic><sub>1</sub>-score is as follows,</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mfrac><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>+</mml:mo><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula></sec></sec></sec><sec id="s2-9"><title>Statistical Comparison</title><p>To determine 
if the observed performance differences between models were statistically significant, we applied paired significance testing. Specifically, for the key metric of classification accuracy, we performed McNemar&#x2019;s test to compare the paired categorical outcomes of our final model (GLM-4-Air) and the baseline model (GPT-4o).</p><p>To calculate a standardized measure of the strength of association between categorical variables, we used the statistic Cohen &#x03C9; [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], defined by the following formula. This metric provides a standardized effect size that is independent of sample size, facilitating comparisons across studies.</p><p>The formula for Cohen &#x03C9; is:</p><disp-formula id="E2"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>&#x03C9;</mml:mi><mml:mo>=</mml:mo><mml:msqrt><mml:mfrac><mml:msup><mml:mi>&#x03C7;</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mi>n</mml:mi></mml:mfrac></mml:msqrt></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>&#x03C7;</italic>&#x00B2; is the chi-square test statistic, reflecting the overall deviation of the observed data from the expected distribution, and n is the total sample size.</p><p>According to Cohen's conventional benchmarks, the effect size can be interpreted as follows: <italic>&#x03C9;</italic> of 0.1 indicates a small effect, <italic>&#x03C9;</italic> of 0.3 a medium effect, and <italic>&#x03C9;</italic> of 0.5 a large effect. 
This statistic provides an objective assessment of the practical importance of observed differences or associations, moving beyond reliance solely on the statistical significance of <italic>P</italic> values.</p></sec><sec id="s2-10"><title>Clinical Impact Assessment</title><p>In clinical practice, the consequences of a misclassification in TNM staging are not uniform; an error can range from being inconsequential to leading to significant deviations in treatment planning. Therefore, beyond conventional performance metrics, we conducted a clinical impact analysis to evaluate the practical implications of model errors.</p><p>Based on the confusion matrices, all misclassifications were systematically categorized into 3 tiers according to their potential impact on clinical decision-making as follows:</p><list list-type="bullet"><list-item><p>Category I (Major Errors): These are the most severe errors where the TNM misclassification results in a significant change in the overall clinical stage, substantially altering both diagnostic evaluation and treatment strategy (eg, shifting from a curative to a palliative intent).</p></list-item><list-item><p>Category II (Moderate Errors): These errors are of moderate severity. The TNM misclassification may sometimes lead to a change in clinical stage, potentially impacting the choice of subsequent therapy (eg, altering the recommended adjuvant treatment regimen), but without a fundamental shift in the treatment goal.</p></list-item><list-item><p>Category III (Minor Errors): These are the least severe errors. The TNM misclassification has a minimal to negligible impact on the final clinical stage and the selection of treatment options, typically involving substage distinctions that do not change the standard of care.</p></list-item></list><p>All erroneous predictions made by the models on the test set were retrospectively reviewed, classified according to this 3-tiered system, and quantified. 
This analysis provides a crucial safety-centric perspective on model performance, assessing its reliability and potential for adoption in real-world clinical workflows.</p><p>In addition, a detailed breakdown of error examples and their categorization rationale is provided in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Taxonomy of primary tumor characteristics (T), regional lymph node involvement (N), and distant metastasis status (M) staging errors with exemplars and rationale.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Error category and specific error types</td><td align="left" valign="bottom">Clinical impact</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">I - Major errors</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between M0 and M1</td><td align="left" valign="top">Misjudgment of tumor metastasis status, leading to incorrect clinical staging, can significantly impact the selection of treatment options.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between T0 and Tn</td><td align="left" valign="top">Misinterpretation of nodules as tumors or tumors as benign nodules, as well as failure to detect tumors, can lead to incorrect clinical staging and may significantly impact the selection of treatment approaches.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between T1/T4, or T2/T4</td><td align="left" valign="top">Discrepancies in staging (Stage I/III or II/III) observed with N0, or discrepancies in staging (Stage II/III) observed with N1, can lead to incorrect staging results, while tumors classified as 
N2 and N3 are uniformly staged as Stage III. However, all the aforementioned scenarios may influence the selection of treatment approaches.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between N0/N2 or N0/N3</td><td align="left" valign="top">Discrepancies in staging (Stage I/III or II/III) observed with T1, T2, and T3 can lead to incorrect staging results, while tumors classified as T4 are uniformly staged as Stage III. However, all the aforementioned scenarios may influence the selection of the extent of lymph node dissection.</td></tr><tr><td align="left" valign="top" colspan="2">II - Moderate errors</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between T1 and T2</td><td align="left" valign="top">Aside from the potential for staging discrepancies between Stage I and II when classified as N0, other scenarios, while not resulting in incorrect staging outcomes, may influence the selection of treatment approaches.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between T1/T3 or T2/T3</td><td align="left" valign="top">Aside from the staging discrepancies (Stage I/II) observed with N0 or the staging discrepancies (Stage II/III) observed with N1, other factors will not lead to incorrect staging results but may influence the selection of treatment approaches.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between T3 and T4</td><td align="left" valign="top">Aside from the staging discrepancies (Stage IIB/IIIA) observed with N0, other factors will not lead to incorrect staging results, but may influence the selection of radiotherapy and chemotherapy.</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between N0 and N1</td><td align="left" valign="top">Apart from the staging discrepancies (Stage I/II) observed with T1 and T2a, the staging discrepancies (Stage II/III) observed with T3, or the discrepancies within the same major stage (Stage IIA/IIB) observed with T2b, other factors (including T4) will not lead to incorrect staging results, but all the aforementioned scenarios may influence the selection of treatment approaches.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between N1/N2 or N1/N3</td><td align="left" valign="top">Apart from the staging discrepancies (Stage IIB/IIIA) observed with T1 and T2, tumors classified as T3 and T4 are uniformly staged as Stage III. However, all the aforementioned scenarios may influence the selection of treatment approaches.</td></tr><tr><td align="left" valign="top" colspan="2">III - Minor errors</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification among M1 Subcategories (M1a, M1b, and M1c)</td><td align="left" valign="top">All cases were clinically staged as Stage IV, but the specific metastatic pattern may influence the selection of treatment approaches for extrathoracic metastases.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification among T1 Subcategories (T1a, T1b, and T1c)</td><td align="left" valign="top">Does not cause a staging error (in this case, clinical stage is determined by N or M). 
Typically, it does not affect treatment.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misclassification between N2 and N3</td><td align="left" valign="top">In this case, all clinical stages are Stage III. It only minimally affects refined adjustments within the treatment strategy, rather than the fundamental selection of the treatment regimen.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Any misclassification involving Tx, Nx, or Mx categories</td><td align="left" valign="top">The clinical stage cannot be determined and requires further evaluation based on the disease presentation.</td></tr></tbody></table></table-wrap></sec><sec id="s2-11"><title>Cost-Effective Evaluation</title><p>To assess the model&#x2019;s deployment feasibility on accessible hardware, we performed a local performance benchmark using consumer-grade GPUs (Four NVIDIA GeForce RTX 4090, 24 GB VRAM each). The GLM-4-Air model was loaded at full precision with an 8K token context length, and its inference latency for each TNM staging component was measured on the black-box test set.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>OCR and Interannotator Agreement</title><p>The Chinese character error rates on the randomly selected 27 reports from white-box and black-box settings were 0.26% and 0.22%, respectively, which is consistent with the data provided by the supplier [<xref ref-type="bibr" rid="ref22">22</xref>]. Manual review confirmed that no serious errors were found in the extraction of core information critical for TNM staging, such as tumor size, nodal status, or metastasis presence, in all cases. 
Specifically, the primary errors in this study were confined to issues like erroneous line breaks, spurious spaces between characters, omitted/misplaced punctuation, and poor paragraph segmentation, none of which impacted the model&#x2019;s TNM staging decisions. The detailed results of the post-OCR Chinese character error rate test and examples of common OCR error types can be found in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><p>The result of the calculation of interannotator agreement using the Cohen kappa coefficient among the clinical annotators is shown in <xref ref-type="table" rid="table5">Table 5</xref>. For the white-box set, Kappa values were 0.714 (T stage), 0.755 (N stage), and 0.778 (M stage). In the black-box set, agreement levels were 0.851 (T stage), 0.796 (N stage), and 0.796 (M stage). All Kappa values exceeded 0.70, indicating substantial agreement across all staging categories in both test sets. The T stage demonstrated the most pronounced improvement in interrater reliability between white-box and black-box evaluations, while N and M stages maintained consistently high agreement levels across both settings.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Interannotator agreement based on Cohen's Kappa coefficient.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Data set</td><td align="left" valign="bottom">T<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup> kappa value (&#x03BA;)</td><td align="left" valign="bottom">N<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> kappa value (&#x03BA;)</td><td align="left" valign="bottom">M<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup> kappa value (&#x03BA;)</td></tr></thead><tbody><tr><td align="left" valign="top">White-box</td><td align="left" valign="top">0.714</td><td align="left" valign="top">0.755</td><td align="left" 
valign="top">0.778</td></tr><tr><td align="left" valign="top">Black-box</td><td align="left" valign="top">0.851</td><td align="left" valign="top">0.796</td><td align="left" valign="top">0.796</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>T: primary tumor characteristics.</p></fn><fn id="table5fn2"><p><sup>b</sup>N: regional lymph node involvement.</p></fn><fn id="table5fn3"><p><sup>c</sup>M: distant metastasis status.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Prompt Optimization</title><p>The results of this study demonstrate a general and substantial improvement in staging accuracy across successive prompt iterations. While an isolated decrease was observed in M staging from Version 1 to Version 2, all T and N stages, as well as the overall clinical stage, exhibited a consistent upward trend in accuracy from Version 1 through Version 4. <xref ref-type="fig" rid="figure4">Figure 4</xref> and <xref ref-type="table" rid="table6">Table 6</xref> compare the accuracy rates across 4 versions of prompts (0705 v, 0708 v, 0715 v, and 0801 v) for T, N, M, and clinical staging. The accuracy rates improved across all staging categories: T staging rose from 18.7% to 65%, N staging from 52.5% to 89%, and M staging from 56.2% to 90%. These improvements led to an increase in overall clinical staging accuracy from 44% to 82%.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Comparison of accuracy across iterative prompt versions. 
M: distant metastasis status; N: regional lymph node involvement; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig04.png"/></fig><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Accuracy of primary tumor characteristics (T), regional lymph node involvement (N), distant metastasis status (M), and clinical staging of iterative prompt versions.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">T staging (%)</td><td align="left" valign="bottom">N staging (%)</td><td align="left" valign="bottom">M staging (%)</td><td align="left" valign="bottom">Clinical staging (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Version 1</td><td align="left" valign="top">18.7</td><td align="left" valign="top">52.5</td><td align="left" valign="top">56.2</td><td align="left" valign="top">44</td></tr><tr><td align="left" valign="top">Version 2</td><td align="left" valign="top">48.9</td><td align="left" valign="top">69.1</td><td align="left" valign="top">51.0</td><td align="left" valign="top">56.4</td></tr><tr><td align="left" valign="top">Version 3</td><td align="left" valign="top">58</td><td align="left" valign="top">83</td><td align="left" valign="top">81</td><td align="left" valign="top">76</td></tr><tr><td align="left" valign="top">Version 4</td><td align="left" valign="top">65</td><td align="left" valign="top">89</td><td align="left" valign="top">90</td><td align="left" valign="top">82</td></tr></tbody></table></table-wrap><p>The notable improvements occurred in 2 instances: first, between version 1 and version 2 for T staging, with accuracy increasing by 30.2% following the implementation of structured information extraction; second, between Version 2 and Version 3 for M staging, through the implementation of diverse prompts for independent T, N, and M 
staging. The data also showed that M and N staging maintained consistently higher accuracy than T staging across all versions.</p></sec><sec id="s3-3"><title>SFT</title><p>Analysis of the LoRA fine-tuning results (<xref ref-type="fig" rid="figure5">Figures 5</xref><xref ref-type="fig" rid="figure6"/>-<xref ref-type="fig" rid="figure7">7</xref>) led to the selection of optimal models for T and N staging, with learning rates of 4.5e-4 and 2.5e-4, respectively. For M staging, as no fine-tuned models exceeded the original model&#x2019;s performance, the baseline GLM-4-Air model was retained, maintaining its accuracy of 90%.</p><p>The results demonstrated the effectiveness of the training process, particularly for T and N staging tasks. The accuracy of T staging improved from 65% to 91%, representing a substantial increase of 26%.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Supervised fine-tuning effects corresponding to learning rates: T staging accuracy. T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Supervised fine-tuning effects corresponding to learning rates: N staging accuracy. N: regional lymph node involvement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig06.png"/></fig><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>Supervised fine-tuning effects corresponding to learning rates: M staging accuracy. 
M: distant metastasis status.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig07.png"/></fig></sec><sec id="s3-4"><title>Model Evaluation</title><sec id="s3-4-1"><title>Performance Metrics</title><p>In the black-box test, the GLM-4-Air model with SFT demonstrated superior performance compared with the GPT-4o model across all categories. Specifically, the GLM-4-Air with SFT achieved accuracy scores of 92% for T staging, 86% for N staging, 92% for M staging, and 90% for clinical staging. As a comparison, the GPT-4o recorded accuracy scores of 87%, 70%, 78%, and 80% for T, N, M, and clinical staging, respectively (<xref ref-type="table" rid="table7">Table 7</xref>). This performance aligned with the white-box test results. The GLM-4-Air model with SFT consistently showed higher accuracy across all stages compared with GPT-4o. It should be noted that the Clinical Staging accuracy is a derivative metric, calculated based on the combined T, N, and M outputs according to the mapping table (<xref ref-type="fig" rid="figure3">Figure 3</xref>), rather than a direct, independent prediction of the model.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Performance comparison of different models in non-small cell lung cancer tumor node metastasis staging across White-box and Black-box tests. 
GLM-4-Air (general language model) (original): The base GLM-4-Air model without supervised fine-tuning; GLM-4-Air (supervised fine-tuning [SFT]): The GLM-4-Air model optimized through our training framework; GPT-4o: A large language model released on May 13, 2024.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Data set and model</td><td align="left" valign="bottom" colspan="5">Accuracy(%)</td></tr><tr><td align="left" valign="top"/><td align="left" valign="bottom">T<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup> staging</td><td align="left" valign="bottom">N<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup> staging</td><td align="left" valign="bottom">M<sup><xref ref-type="table-fn" rid="table7fn3">c</xref></sup> staging</td><td align="left" valign="bottom">Exact match</td><td align="left" valign="bottom">Clinical staging</td></tr></thead><tbody><tr><td align="left" valign="top">White-box</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GLM-4-Air (Original)</td><td align="left" valign="top">65</td><td align="left" valign="top">89</td><td align="left" valign="top">90</td><td align="left" valign="top">54</td><td align="left" valign="top">80</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GLM-4-Air (SFT)</td><td align="left" valign="top">91</td><td align="left" valign="top">92</td><td align="left" valign="top">90</td><td align="left" valign="top">80</td><td align="left" valign="top">93</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="left" valign="top">86</td><td align="left" 
valign="top">80</td><td align="left" valign="top">70</td><td align="left" valign="top">61</td><td align="left" valign="top">77</td></tr><tr><td align="left" valign="top">Black-box</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GLM-4-Air (SFT)</td><td align="left" valign="top">92</td><td align="left" valign="top">86</td><td align="left" valign="top">92</td><td align="left" valign="top">77</td><td align="left" valign="top">90</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="left" valign="top">87</td><td align="left" valign="top">70</td><td align="left" valign="top">78</td><td align="left" valign="top">64</td><td align="left" valign="top">80</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>T: primary tumor characteristics.</p></fn><fn id="table7fn2"><p><sup>b</sup>N: regional lymph node involvement.</p></fn><fn id="table7fn3"><p><sup>c</sup>M: distant metastasis status.</p></fn></table-wrap-foot></table-wrap><p>As shown in <xref ref-type="fig" rid="figure8">Figures 8</xref> and <xref ref-type="fig" rid="figure9">9</xref>, the GLM-4-Air (SFT) demonstrated superior performance in white-box testing across all staging categories compared with both its original version and GPT-4o. This was maintained in black-box testing, with GLM-4-Air (SFT) consistently outperforming GPT-4o. The robust performance validated the effectiveness of the fine-tuning approach in enhancing the model&#x2019;s staging capabilities.</p><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>The accuracy of GLM-4-Air (original and SFT) and GPT-4o in the white-box test. 
GLM: general language model; M: distant metastasis status; N: regional lymph node involvement; SFT: supervised fine-tuning; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig08.png"/></fig><fig position="float" id="figure9"><label>Figure 9.</label><caption><p>The accuracy of GLM-4-Air (SFT) and GPT-4o in the black-box test. GLM: general language model; M: distant metastasis status; N: regional lymph node involvement; SFT: supervised fine-tuning; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig09.png"/></fig><p>The calculated 95% CIs are presented in <xref ref-type="table" rid="table8">Table 8</xref> below.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Comparison of accuracy (95% CI) for tumor node metastasis staging of GLM-4-Air (general language model) and GPT-4o.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Model and data set</td><td align="left" valign="top">Stage</td><td align="left" valign="top">Accuracy (%)</td><td align="left" valign="top">Accuracy (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">GLM-4-Air 32B (original)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White-box</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">T<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup></td><td align="char" char="." valign="top">65</td><td align="char" char="." 
valign="top">0.552&#x2010;0.736</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">N<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td><td align="char" char="." valign="top">89</td><td align="char" char="." valign="top">0.814&#x2010;0.938</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">M<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup></td><td align="char" char="." valign="top">90</td><td align="char" char="." valign="top">0.826&#x2010;0.945</td></tr><tr><td align="left" valign="top">GLM-4-Air 32B (SFT)<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White-box</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">T</td><td align="char" char="." valign="top">91</td><td align="char" char="." valign="top">0.838&#x2010;0.952</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">N</td><td align="char" char="." valign="top">92</td><td align="char" char="." valign="top">0.850&#x2010;0.959</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">M</td><td align="char" char="." valign="top">90</td><td align="char" char="." valign="top">0.826&#x2010;0.945</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Black-box</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">T</td><td align="char" char="." valign="top">92</td><td align="char" char="." 
valign="top">0.850&#x2010;0.959</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">N</td><td align="char" char="." valign="top">86</td><td align="char" char="." valign="top">0.779&#x2010;0.915</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">M</td><td align="char" char="." valign="top">92</td><td align="char" char="." valign="top">0.850&#x2010;0.959</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White-box</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">T</td><td align="char" char="." valign="top">86</td><td align="char" char="." valign="top">0.779&#x2010;0.915</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">N</td><td align="char" char="." valign="top">80</td><td align="char" char="." valign="top">0.711&#x2010;0.867</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">M</td><td align="char" char="." valign="top">70</td><td align="char" char="." valign="top">0.604&#x2010;0.781</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Black-box</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">T</td><td align="char" char="." valign="top">87</td><td align="char" char="." valign="top">0.790&#x2010;0.922</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">N</td><td align="char" char="." valign="top">70</td><td align="char" char="." 
valign="top">0.604&#x2010;0.781</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">M</td><td align="char" char="." valign="top">78</td><td align="char" char="." valign="top">0.689&#x2010;0.850</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>T: primary tumor characteristics.</p></fn><fn id="table8fn2"><p><sup>b</sup>N: regional lymph node involvement.</p></fn><fn id="table8fn3"><p><sup>c</sup>M: distant metastasis status.</p></fn><fn id="table8fn4"><p><sup>d</sup>SFT: supervised fine-tuning.</p></fn></table-wrap-foot></table-wrap><p>When comparing GLM-4-Air before and after SFT on the white-box test set, the CI for T staging significantly narrowed, improving from 0.552&#x2010;0.736 to 0.838&#x2010;0.952. A similar narrowing was observed for N staging, with the interval improving from 0.814&#x2010;0.938 to 0.850&#x2010;0.959. The CI for M staging remained unchanged due to the model not being fine-tuned for this specific category.</p><p>In the white-box test comparison between GLM-4-Air and GPT-4o, the CIs for GLM-4-Air were superior to those of GPT-4o across all individual TNM categories. This pattern was consistent in the black-box test comparison, where GLM-4-Air again demonstrated higher CIs than GPT-4o for each TNM category.</p><p>The results of the confusion matrices of both models in white-box and black-box were shown in <xref ref-type="fig" rid="figure10">Figures 10</xref><xref ref-type="fig" rid="figure11"/><xref ref-type="fig" rid="figure12"/>-<xref ref-type="fig" rid="figure13">13</xref> below.</p><fig position="float" id="figure10"><label>Figure 10.</label><caption><p>Confusion matrix for T, N, and M staging by GLM-4-Air in white-box. 
GLM: general language model; M: distant metastasis status; N: regional lymph node involvement; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig10.png"/></fig><fig position="float" id="figure11"><label>Figure 11.</label><caption><p>Confusion matrix for T, N, and M staging by GPT-4o in white-box. M: distant metastasis status; N: regional lymph node involvement; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig11.png"/></fig><fig position="float" id="figure12"><label>Figure 12.</label><caption><p>Confusion matrix for T, N, and M staging by GLM-4-Air in a black box. GLM: general language model; M: distant metastasis status; N: regional lymph node involvement; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig12.png"/></fig><fig position="float" id="figure13"><label>Figure 13.</label><caption><p>Confusion matrix for T, N, and M staging by GPT-4o in a black box. 
M: distant metastasis status; N: regional lymph node involvement; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig13.png"/></fig><p>Based on the confusion matrices, we calculated the precision, recall, and <italic>F</italic><sub>1</sub>-score for all subcategories of the T, N, and M stages for both models in the white-box and black-box test sets, as shown in <xref ref-type="table" rid="table9">Tables 9</xref><xref ref-type="table" rid="table10"/>-<xref ref-type="table" rid="table11">11</xref> and <xref ref-type="fig" rid="figure14">Figures 14</xref><xref ref-type="fig" rid="figure15"/><xref ref-type="fig" rid="figure16"/><xref ref-type="fig" rid="figure17"/><xref ref-type="fig" rid="figure18"/>-<xref ref-type="fig" rid="figure19">19</xref>.</p><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Comparison of precision, recall, and <italic>F</italic><sub>1</sub>-score for primary tumor characteristics (T) staging between GLM-4-Air (general language model) and GPT-4o.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and T staging</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">GLM-4-Air (white-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">50.0</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a</td><td align="left" 
valign="top">75.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.857</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1b</td><td align="left" valign="top">100.0</td><td align="left" valign="top">90.0</td><td align="left" valign="top">0.947</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1c</td><td align="left" valign="top">86.7</td><td align="left" valign="top">92.9</td><td align="left" valign="top">0.897</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2a</td><td align="left" valign="top">91.7</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.956</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2b</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td><td align="left" valign="top">83.3</td><td align="left" valign="top">71.4</td><td align="left" valign="top">0.769</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td><td align="left" valign="top">92.0</td><td align="left" valign="top">92.0</td><td align="left" valign="top">0.920</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tx</td><td 
align="left" valign="top">85.7</td><td align="left" valign="top">85.7</td><td align="left" valign="top">0.857</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">91.4</td><td align="left" valign="top">88.2</td><td align="left" valign="top">0.887</td></tr><tr><td align="left" valign="top">GPT-4o (white-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td align="left" valign="top">50.0</td><td align="left" valign="top">50.0</td><td align="left" valign="top">0.500</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1b</td><td align="left" valign="top">90.0</td><td align="left" valign="top">90.0</td><td align="left" valign="top">0.900</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1c</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td><td align="left" valign="top">75.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.857</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2a</td><td align="left" valign="top">100.0</td><td align="left" valign="top">90.9</td><td 
align="left" valign="top">0.952</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2b</td><td align="left" valign="top">88.9</td><td align="left" valign="top">72.7</td><td align="left" valign="top">0.800</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td><td align="left" valign="top">71.4</td><td align="left" valign="top">71.4</td><td align="left" valign="top">0.714</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td><td align="left" valign="top">77.8</td><td align="left" valign="top">84.0</td><td align="left" valign="top">0.808</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tx</td><td align="left" valign="top">85.7</td><td align="left" valign="top">85.7</td><td align="left" valign="top">0.857</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">83.9</td><td align="left" valign="top">84.5</td><td align="left" valign="top">0.839</td></tr><tr><td align="left" valign="top">GLM-4-Air (black-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a</td><td align="left" valign="top">80.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.889</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1b</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1c</td><td align="left" valign="top">84.6</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.917</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td><td align="left" valign="top">100.0</td><td align="left" valign="top">75.0</td><td align="left" valign="top">0.857</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2a</td><td align="left" valign="top">100.0</td><td align="left" valign="top">91.7</td><td align="left" valign="top">0.957</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2b</td><td align="left" valign="top">91.7</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.956</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td><td align="left" valign="top">85.7</td><td align="left" valign="top">80.0</td><td align="left" valign="top">0.828</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td><td align="left" valign="top">91.7</td><td align="left" valign="top">95.6</td><td align="left" valign="top">0.936</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tx</td><td align="left" valign="top">100.0</td><td align="left" valign="top">80.0</td><td align="left" valign="top">0.889</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">92.6</td><td align="left" valign="top">91.4</td><td align="left" valign="top">0.914</td></tr><tr><td align="left" valign="top">GPT-4o (black-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a</td><td align="left" valign="top">60.0</td><td align="left" valign="top">75.0</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1b</td><td align="left" valign="top">90.0</td><td align="left" valign="top">90.0</td><td align="left" valign="top">0.900</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1c</td><td align="left" valign="top">90.9</td><td align="left" valign="top">90.9</td><td align="left" valign="top">0.909</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td><td align="left" valign="top">42.9</td><td align="left" valign="top">75.0</td><td align="left" valign="top">0.546</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2a</td><td align="left" valign="top">92.3</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.960</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2b</td><td align="left" valign="top">100.0</td><td align="left" valign="top">90.9</td><td align="left" valign="top">0.952</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td><td align="left" valign="top">86.7</td><td align="left" valign="top">86.7</td><td align="left" valign="top">0.867</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td><td align="left" valign="top">95.0</td><td align="left" valign="top">82.6</td><td align="left" valign="top">0.884</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tx</td><td align="left" valign="top">88.9</td><td align="left" valign="top">80.0</td><td align="left" valign="top">0.842</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">83.0</td><td align="left" valign="top">85.7</td><td align="left" valign="top">0.836</td></tr></tbody></table></table-wrap><table-wrap id="t10" position="float"><label>Table 10.</label><caption><p>Comparison of precision, recall, and <italic>F</italic><sub>1</sub>-score for regional lymph node involvement (N) staging between GLM-4-Air (general language model) and GPT-4o.</p></caption><table id="table10" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and N staging</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">GLM-4-Air (white-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td><td align="left" valign="top">93.5</td><td align="left" valign="top">95.6</td><td align="left" valign="top">0.945</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td><td align="left" valign="top">100.0</td><td align="left" valign="top">80.0</td><td align="left" valign="top">0.889</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td><td align="left" valign="top">94.1</td><td align="left" valign="top">97.0</td><td align="left" valign="top">0.955</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N3</td><td align="left" valign="top">100.0</td><td align="left" valign="top">81.2</td><td align="left" valign="top">0.897</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nx</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">77.5</td><td align="left" valign="top">70.8</td><td align="left" valign="top">0.737</td></tr><tr><td align="left" valign="top">GPT-4o (white-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">66.7</td><td align="left" valign="top">0.800</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td><td align="left" valign="top">66.7</td><td align="left" valign="top">40.0</td><td align="left" valign="top">0.500</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td><td align="left" valign="top">82.0</td><td align="left" valign="top">97.0</td><td align="left" valign="top">0.889</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N3</td><td align="left" valign="top">93.8</td><td align="left" valign="top">93.8</td><td align="left" valign="top">0.938</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nx</td><td align="left" valign="top">8.3</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.154</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">70.2</td><td align="left" valign="top">79.5</td><td align="left" valign="top">0.656</td></tr><tr><td align="left" valign="top">GLM-4-Air (black-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td><td align="left" valign="top">89.4</td><td align="left" valign="top">97.7</td><td align="left" valign="top">0.933</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td><td align="left" valign="top">100.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">1.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td><td 
align="left" valign="top">81.6</td><td align="left" valign="top">91.2</td><td align="left" valign="top">0.861</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N3</td><td align="left" valign="top">100.0</td><td align="left" valign="top">44.4</td><td align="left" valign="top">0.615</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nx</td><td align="left" valign="top">50.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">84.2</td><td align="left" valign="top">86.7</td><td align="left" valign="top">0.815</td></tr><tr><td align="left" valign="top">GPT-4o (black-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">58.1</td><td align="left" valign="top">0.735</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td><td align="left" valign="top">66.7</td><td align="left" valign="top">66.7</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td><td align="left" valign="top">67.4</td><td align="left" valign="top">91.2</td><td align="left" valign="top">0.775</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N3</td><td align="left" valign="top">90.9</td><td align="left" valign="top">55.6</td><td 
align="left" valign="top">0.690</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nx</td><td align="left" valign="top">13.3</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.235</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">67.7</td><td align="left" valign="top">74.3</td><td align="left" valign="top">0.620</td></tr></tbody></table></table-wrap><table-wrap id="t11" position="float"><label>Table 11.</label><caption><p>Comparison of precision, recall, and <italic>F</italic><sub>1</sub>-score for distant metastasis status (M) staging between GLM-4-Air (general language model) and GPT-4o.</p></caption><table id="table11" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and M staging</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">GLM-4-Air (white-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">94.6</td><td align="left" valign="top">0.972</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1a</td><td align="left" valign="top">71.4</td><td align="left" valign="top">55.6</td><td align="left" valign="top">0.625</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1b</td><td align="left" valign="top">57.1</td><td align="left" 
valign="top">80.0</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1c</td><td align="left" valign="top">96.2</td><td align="left" valign="top">92.6</td><td align="left" valign="top">0.943</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mx</td><td align="left" valign="top">50.0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">74.9</td><td align="left" valign="top">84.5</td><td align="left" valign="top">0.775</td></tr><tr><td align="left" valign="top">GPT-4o (white-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">61.8</td><td align="left" valign="top">0.764</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1a</td><td align="left" valign="top">50.0</td><td align="left" valign="top">30.0</td><td align="left" valign="top">0.375</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1b</td><td align="left" valign="top">23.1</td><td align="left" valign="top">75.0</td><td align="left" valign="top">0.353</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1c</td><td align="left" valign="top">86.7</td><td align="left" valign="top">96.3</td><td align="left" valign="top">0.912</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mx</td><td align="left" valign="top">23.5</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.381</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">56.7</td><td align="left" valign="top">72.6</td><td align="left" valign="top">0.557</td></tr><tr><td align="left" valign="top">GLM-4-Air (black-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M0</td><td align="left" valign="top">98.4</td><td align="left" valign="top">95.3</td><td align="left" valign="top">0.968</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1a</td><td align="left" valign="top">100.0</td><td align="left" valign="top">80.0</td><td align="left" valign="top">0.889</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1b</td><td align="left" valign="top">57.1</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.727</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1c</td><td align="left" valign="top">94.7</td><td align="left" valign="top">94.7</td><td align="left" valign="top">0.947</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mx</td><td align="left" valign="top">62.5</td><td align="left" valign="top">62.5</td><td align="left" valign="top">0.625</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">82.6</td><td align="left" valign="top">86.5</td><td align="left" valign="top">0.831</td></tr><tr><td align="left" valign="top">GPT-4o (black-box)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">73.0</td><td align="left" valign="top">0.844</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1a</td><td align="left" valign="top">83.3</td><td align="left" valign="top">83.3</td><td align="left" valign="top">0.833</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1b</td><td align="left" valign="top">40.0</td><td align="left" valign="top">50.0</td><td align="left" valign="top">0.444</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>M1c</td><td align="left" valign="top">85.7</td><td align="left" valign="top">94.7</td><td align="left" valign="top">0.900</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mx</td><td align="left" valign="top">31.8</td><td align="left" valign="top">87.5</td><td align="left" valign="top">0.467</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-averaging</td><td align="left" valign="top">68.2</td><td align="left" valign="top">77.7</td><td align="left" valign="top">0.698</td></tr></tbody></table></table-wrap><fig position="float" id="figure14"><label>Figure 14.</label><caption><p>Comparison of 
precision and recall for T staging between GLM-4-Air and GPT-4o in the white-box and black-box tests. GLM: general language model; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig14.png"/></fig><fig position="float" id="figure15"><label>Figure 15.</label><caption><p>The <italic>F</italic><sub>1</sub>-score of T staging of GLM-4-Air and GPT-4o in the white-box and black-box tests. GLM: general language model; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig15.png"/></fig><fig position="float" id="figure16"><label>Figure 16.</label><caption><p>Comparison of precision and recall for N staging between GLM-4-Air and GPT-4o in the white-box and black-box tests. GLM: general language model; N: regional lymph node involvement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig16.png"/></fig><fig position="float" id="figure17"><label>Figure 17.</label><caption><p>The <italic>F</italic><sub>1</sub>-score of N staging of GLM-4-Air and GPT-4o in the white-box and black-box tests. GLM: general language model; N: regional lymph node involvement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig17.png"/></fig><fig position="float" id="figure18"><label>Figure 18.</label><caption><p>Comparison of precision and recall for M staging between GLM-4-Air and GPT-4o in the white-box and black-box tests. 
GLM: general language model; M: distant metastasis status.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig18.png"/></fig><fig position="float" id="figure19"><label>Figure 19.</label><caption><p>The <italic>F</italic><sub>1</sub>-score of M staging of GLM-4-Air and GPT-4o in the white-box and black-box tests. GLM: general language model; M: distant metastasis status.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig19.png"/></fig><p>The results for each T, N, and M stage will be elaborated separately in this section, with corresponding discussions provided in the Discussion section.</p><list list-type="bullet"><list-item><p>White-box Evaluation (T): In the T1a subcategory, GLM-4-Air demonstrated precision and recall of 75.0% and 100.0%, respectively, showing slightly lower precision but equivalent recall compared with GPT-4o (100.0% and 100.0%). For T1b staging, GLM-4-Air achieved a precision of 100.0% and a recall of 90.0%, outperforming GPT-4o (90.0% for both metrics) in precision. In the T1c subcategory, GLM-4-Air demonstrated precision and recall of 86.7% and 92.9%, respectively, underperforming GPT-4o (both 100.0%). In T2 staging, GLM-4-Air attained 100.0% of both precision and recall versus GPT-4o (75.0% and 100.0%). For T2b, GLM-4-Air maintained performance of 100.0% for both precision and recall, compared with GPT-4o (88.9 % precision and 72.7% recall). In T3 staging, GLM-4-Air showed precision of 83.3% and recall of 71.4%, surpassing GPT-4o in precision (71.4%) while matching recall. For T4 staging, GLM-4-Air achieved 92.0% for both metrics, exceeding GPT-4o (77.8% precision and 84.0% recall).</p></list-item><list-item><p>Black-box Evaluation (T): In T1a staging, GLM-4-Air attained 80.0% precision and 100.0% recall, outperforming GPT-4o (60.0% and 75.0%). 
For T1b, GLM-4-Air achieved 100.0% for both precision and recall, compared with GPT-4o&#x2019;s 90.0% for both metrics.
In black-box testing, GLM-4-Air demonstrated further improvement with macro-average metrics of 92.6%, 91.4%, and 0.914, consistently exceeding GPT-4o&#x2019;s corresponding values of 83.0%, 85.7%, and 0.836.</p></list-item><list-item><p>White-box Evaluation (N): In the N0 subcategory, GLM-4-Air demonstrated a slightly lower precision than GPT-4o (93.48% vs 100.0%) but achieved a substantially higher recall (95.6% vs 66.7%). For N1 staging, GLM-4-Air attained a precision of 100.0% and a recall of 80.0%, outperforming GPT-4o (66.7% precision and 40.0% recall). In N2 staging, GLM-4-Air showed higher precision (94.1% vs 82.0%) while matching GPT-4o&#x2019;s recall (97.0% for both models). For N3 staging, GLM-4-Air achieved precision of 100.0%, surpassing GPT-4o (93.8%).</p></list-item><list-item><p>Black-box Evaluation (N): In the N0 subcategory, GLM-4-Air again showed lower precision than GPT-4o (89.4% vs 100.0%) but demonstrated significantly higher recall (97.7% vs 58.1%). For N1 staging, GLM-4-Air achieved scores of 100.0% for both precision and recall, while GPT-4o achieved 66.7% for both metrics. In N2 staging, GLM-4-Air showed higher precision (81.6% vs 67.4%) with both models achieving identical recall (91.2%). For N3 staging, GLM-4-Air attained precision of 100.0% compared with 90.9% of GPT-4o, though with a slightly lower recall (44.4% vs 55.6%).</p></list-item><list-item><p><italic>F</italic><sub>1</sub>-score Analysis (N): In white-box testing, GLM-4-Air&#x2019;s <italic>F</italic><sub>1</sub>-score for the Nx subcategory was 0 due to it misclassifying the only Nx case in white-box; excluding this, GLM-4-Air achieved higher <italic>F</italic><sub>1</sub>-scores than GPT-4o in the N0, N1, and N2 subcategories. However, in the N3 subcategory, GLM-4-Air&#x2019;s <italic>F</italic><sub>1</sub>-score (0.897) was slightly lower than GPT-4o&#x2019;s (0.938). 
In black-box testing, GLM-4-Air demonstrated higher <italic>F</italic><sub>1</sub>-scores than GPT-4o in the N0, N1, N2, and Nx subcategories. Although its <italic>F</italic><sub>1</sub>-score in the N3 subcategory (0.615) was lower than GPT-4o&#x2019;s (0.690), GLM-4-Air maintained <italic>F</italic><sub>1</sub>-scores above 0.615 across all N subcategories, while GPT-4o&#x2019;s minimum <italic>F</italic><sub>1</sub>-score was 0.235 (in Nx subcategory).</p></list-item><list-item><p>Macro-average Metrics (N): In white-box testing, GLM-4-Air achieved macro-average precision, recall, and <italic>F</italic><sub>1</sub>-score of 77.5%, 70.8%, and 0.737, respectively, compared with GPT-4o&#x2019;s 70.2%, 79.5%, and 0.656. In black-box testing, GLM-4-Air&#x2019;s macro-average metrics were 84.2%, 86.7%, and 0.815, consistently outperforming GPT-4o&#x2019;s corresponding values of 67.7%, 74.3%, and 0.620.</p></list-item><list-item><p>White-box evaluation (M): In the white-box test set, the two models demonstrated identical precision (100.0%) only in the M0 subcategory. Across all other subcategories (M1a, M1b, M1c, Mx), GLM-4-Air consistently achieved higher precision than GPT-4o. Regarding recall, both models performed equally (100.0%) in the Mx subcategory. GLM-4-Air exhibited superior recall in the M0, M1a, and M1b subcategories compared with GPT-4o, though it showed a slightly lower recall in M1c (92.6% vs 96.3%).</p></list-item><list-item><p>Black-box Evaluation (M): In the black-box test set, GLM-4-Air showed slightly lower precision than GPT-4o in the M0 subcategory (98.4% vs 100.0%). However, it outperformed GPT-4o in all remaining subcategories (M1a, M1b, M1c, Mx) in terms of precision. For recall, each model showed advantages in different areas: GLM-4-Air achieved higher recall in M0 and M1b, with a recall of 100.0% in M1b, which significantly surpassed GPT-4o (50.0%). 
Conversely, GLM-4-Air&#x2019;s recall was marginally lower in M1a and Mx, while both models achieved identical recall in M1c.</p></list-item><list-item><p><italic>F</italic><sub>1</sub>-score Analysis (M): In white-box testing, GLM-4-Air achieved higher <italic>F</italic><sub>1</sub>-scores than GPT-4o across all M subcategories (M0, M1a, M1b, M1c, Mx). This pattern was consistent in black-box testing, where GLM-4-Air again demonstrated superior <italic>F</italic><sub>1</sub>-scores in each subcategory. Additionally, GLM-4-Air maintained <italic>F</italic><sub>1</sub>-scores above 0.625 across all M subcategories in both white-box and black-box settings, whereas GPT-4o&#x2019;s lowest <italic>F</italic><sub>1</sub>-scores were observed in M1b (0.353 in white-box and 0.444 in black-box).</p></list-item><list-item><p>Macro-average Metrics (M): In white-box testing, GLM-4-Air achieved macro-average precision, recall, and <italic>F</italic><sub>1</sub>-score of 74.9%, 84.5%, and 0.775, respectively, comprehensively outperforming GPT-4o (56.7%, 72.6%, and 0.557). Similarly, in black-box testing, GLM-4-Air&#x2019;s macro-average metrics (82.6%, 86.5%, and 0.831) were significantly higher than those of GPT-4o (68.2%, 77.7%, and 0.698).</p></list-item></list><p>To evaluate the improvements GLM-4-Air achieved via SFT, we also compared the <italic>F</italic><sub>1</sub>-scores for each staging subcategory before and after SFT in the white-box test set. Since the baseline model was ultimately selected for M staging, the comparison focused specifically on the T and N subcategories. The results are presented in <xref ref-type="table" rid="table12">Table 12</xref> below. 
Combining with the analysis of the post-SFT white-box results for T and N staging (<xref ref-type="table" rid="table9">Tables 9</xref> and <xref ref-type="table" rid="table10">10</xref>), it can be seen that GLM-4-Air demonstrated marked overall improvement in both T and N staging tasks following SFT.</p><p>In addition, to provide a more intuitive demonstration of the performance improvement in GLM-4-Air after SFT, we calculated the difference in precision, recall, and <italic>F</italic><sub>1</sub>-scores for each T and N staging subcategory following SFT. The difference in the <italic>F</italic><sub>1</sub>-score of T and N staging was illustrated in <xref ref-type="fig" rid="figure20">Figure 20</xref>.</p><table-wrap id="t12" position="float"><label>Table 12.</label><caption><p>Precision, recall, and <italic>F</italic><sub>1</sub>-score for primary tumor characteristics (T) and regional lymph node involvement (N) staging of original (presupervised fine-tuning) GLM-4-Air (general language model).</p></caption><table id="table12" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and staging</td><td align="left" valign="bottom">Precision (%)</td><td align="left" valign="bottom">Recall (%)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">GLM-4-Air (Original)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T0</td><td align="left" valign="top">100.0</td><td align="left" valign="top">50.0</td><td align="left" valign="top">0.667</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1a</td><td align="left" valign="top">60.0</td><td align="left" valign="top">100.0</td><td align="left" 
valign="top">0.750</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1b</td><td align="left" valign="top">66.7</td><td align="left" valign="top">60.0</td><td align="left" valign="top">0.632</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T1c</td><td align="left" valign="top">66.7</td><td align="left" valign="top">42.9</td><td align="left" valign="top">0.522</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2</td><td align="left" valign="top">27.3</td><td align="left" valign="top">100.0</td><td align="left" valign="top">0.429</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2a</td><td align="left" valign="top">100.0</td><td align="left" valign="top">45.4</td><td align="left" valign="top">0.625</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T2b</td><td align="left" valign="top">100.0</td><td align="left" valign="top">54.6</td><td align="left" valign="top">0.706</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T3</td><td align="left" valign="top">30.0</td><td align="left" valign="top">42.9</td><td align="left" valign="top">0.353</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>T4</td><td align="left" valign="top">73.3</td><td align="left" valign="top">88.0</td><td align="left" valign="top">0.800</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tx</td><td align="left" valign="top">71.4</td><td align="left" valign="top">71.4</td><td align="left" 
valign="top">0.714</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-Averaging</td><td align="left" valign="top">69.5</td><td align="left" valign="top">65.5</td><td align="left" valign="top">0.620</td></tr><tr><td align="left" valign="top">GLM-4-Air (Original)</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N0</td><td align="left" valign="top">97.7</td><td align="left" valign="top">93.3</td><td align="left" valign="top">0.954</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td><td align="left" valign="top">80.0</td><td align="left" valign="top">80.0</td><td align="left" valign="top">0.800</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td><td align="left" valign="top">85.7</td><td align="left" valign="top">90.9</td><td align="left" valign="top">0.882</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N3</td><td align="left" valign="top">92.9</td><td align="left" valign="top">81.2</td><td align="left" valign="top">0.867</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nx</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-Averaging</td><td align="left" valign="top">71.2</td><td align="left" valign="top">69.1</td><td align="left" valign="top">0.701</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N1</td><td align="left" valign="top">100.0</td><td align="left" valign="top">80.0</td><td align="left" valign="top">0.889</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N2</td><td align="left" valign="top">94.1</td><td align="left" valign="top">97.0</td><td align="left" valign="top">0.955</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>N3</td><td align="left" valign="top">100.0</td><td align="left" valign="top">81.2</td><td align="left" valign="top">0.897</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nx</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.000</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Macro-Averaging</td><td align="left" valign="top">77.5</td><td align="left" valign="top">70.8</td><td align="left" valign="top">0.737</td></tr></tbody></table></table-wrap><fig position="float" id="figure20"><label>Figure 20.</label><caption><p>Changes in the <italic>F</italic><sub>1</sub>-score for T and N staging of GLM-4-Air in the white-box test pre-SFT and post-SFT. GLM: general language model; N: regional lymph node involvement; SFT: supervised fine-tuning; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig20.png"/></fig></sec><sec id="s3-4-2"><title>Comparison of T Staging Performance Before and After SFT</title><p>For the T0 subcategory, precision, recall, and <italic>F</italic><sub>1</sub>-score remained unchanged. 
In T1a staging, precision improved from 60.0% to 75.0%, while recall remained at 100.0%, resulting in an increase in <italic>F</italic><sub>1</sub>-score from 0.750 to 0.857. T1b staging showed substantial improvement: precision increased from 66.7% to 100.0%, recall from 60.0% to 90.0%, and <italic>F</italic><sub>1</sub>-score from 0.632 to 0.947. The notable improvement was observed in T1c staging: precision increased from 66.7% to 86.7%, recall from 42.9% to 92.9%, and <italic>F</italic><sub>1</sub>-score from 0.522 to 0.897. In T2 staging, precision improved markedly from 27.3% to 100.0%, and <italic>F</italic><sub>1</sub>-score increased from 0.429 to 1.000. For T2a staging, precision slightly decreased from 100.0% to 91.7%, but recall substantially increased from 45.4% to 100.0%, raising the <italic>F</italic><sub>1</sub>-score from 0.625 to 0.956. T2b staging maintained precision of 100.0% while recall increased from below 60.0% to 100.0%, improving the <italic>F</italic><sub>1</sub>-score from 0.706 to 1.000. In T3 staging, precision, recall, and <italic>F</italic><sub>1</sub>-score improved from 30.0%, 42.9%, and 0.353 to 83.3%, 71.4%, and 0.769, respectively. For T4 staging, precision increased from 73.3% to 92.0%, recall from 88.0% to 92.0%, and <italic>F</italic><sub>1</sub>-score from 0.800 to 0.920. In Tx staging, precision, recall, and <italic>F</italic><sub>1</sub>-score all improved from 71.4%, 71.4%, and 0.714 to 85.7%, 85.7%, and 0.857, respectively.</p><p>Macro-average metrics demonstrated substantial improvement after SFT. 
Specifically, precision increased from 69.5% to 91.4%, recall from 65.5% to 88.2%, and <italic>F</italic><sub>1</sub>-score from 0.620 to 0.887.</p></sec><sec id="s3-4-3"><title>Comparison of N Staging Performance Before and After SFT</title><p>For N0 staging, precision slightly decreased from 97.7% to 93.5% after SFT, but recall improved from 93.3% to 95.6%, maintaining a high <italic>F</italic><sub>1</sub>-score of 0.945 (compared with 0.954 pre-SFT). The improvement was observed in N1 staging: precision increased from 80.0% to 100.0%, recall remained at 80.0%, and <italic>F</italic><sub>1</sub>-score improved from 0.800 to 0.889. In N2 staging, precision increased from 85.7% to 94.1%, recall from 90.9% to 97.0%, and <italic>F</italic><sub>1</sub>-score from 0.882 to 0.955. For N3 staging, precision improved from 92.9% to 100.0%, recall remained at 81.2%, and <italic>F</italic><sub>1</sub>-score increased from 0.867 to 0.897. Macro-average metrics showed improvement after SFT: precision increased from 71.2% to 77.5%, recall from 69.1% to 70.8%, and <italic>F</italic><sub>1</sub>-score from 0.701 to 0.737.</p></sec><sec id="s3-4-4"><title>Statistical Comparison</title><p>The results of the paired significance tests (McNemar&#x2019;s test) for model comparisons are summarized in <xref ref-type="table" rid="table13">Table 13</xref> below.</p><table-wrap id="t13" position="float"><label>Table 13.</label><caption><p>Results of the McNemar&#x2019;s test for model comparisons between GLM-4-Air (general language model) and GPT-4o.</p></caption><table id="table13" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameters</td><td align="left" valign="bottom">White-box T<sup><xref ref-type="table-fn" rid="table13fn1">a</xref></sup></td><td align="left" valign="bottom">Black-box T</td><td align="left" valign="bottom">White-box N<sup><xref ref-type="table-fn" rid="table13fn2">b</xref></sup></td><td align="left" valign="bottom">Black-box N</td><td 
align="left" valign="bottom">White-box M<sup><xref ref-type="table-fn" rid="table13fn3">c</xref></sup></td><td align="left" valign="bottom">Black-box M</td></tr></thead><tbody><tr><td align="left" valign="top">Chi-square test (<italic>df</italic>)</td><td align="left" valign="top">14 (45)</td><td align="left" valign="top">14.3 (36)</td><td align="left" valign="top">23 (10)</td><td align="left" valign="top">24.5 (10)</td><td align="left" valign="top">21 (10)</td><td align="left" valign="top">15.9 (10)</td></tr><tr><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">.01</td><td align="left" valign="top">.006</td><td align="left" valign="top">.02</td><td align="left" valign="top">.10</td></tr><tr><td align="left" valign="top">Significance</td><td align="left" valign="top">Not significant</td><td align="left" valign="top">Not significant</td><td align="left" valign="top">Significant</td><td align="left" valign="top">Significant</td><td align="left" valign="top">Significant</td><td align="left" valign="top">Not significant</td></tr><tr><td align="left" valign="top">Dataset (n)</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td></tr><tr><td align="left" valign="top">Staging</td><td align="left" valign="top">T0, T1a, T1b, T1c, T2, T2a, T2b, T3, T4, Tx</td><td align="left" valign="top">T1a, T1b, T1c, T2, T2a, T2b, T3, T4, Tx</td><td align="left" valign="top">N0, N1, N2, N3, Nx</td><td align="left" valign="top">N0, N1, N2, N3, Nx</td><td align="left" valign="top">M0, M1a, M1b, M1c, Mx</td><td align="left" valign="top">M0, M1a, M1b, M1c, Mx</td></tr><tr><td align="left" valign="top">&#x03C9;</td><td align="left" valign="top">0.374</td><td align="left" 
valign="top">0.378</td><td align="left" valign="top">0.480</td><td align="left" valign="top">0.495</td><td align="left" valign="top">0.458</td><td align="left" valign="top">0.399</td></tr></tbody></table><table-wrap-foot><fn id="table13fn1"><p><sup>a</sup>T: primary tumor characteristics.</p></fn><fn id="table13fn2"><p><sup>b</sup>N: regional lymph node involvement.</p></fn><fn id="table13fn3"><p><sup>c</sup>M: distant metastasis status.</p></fn></table-wrap-foot></table-wrap><p>For T staging, the differences were not statistically significant in both the black-box (<italic>&#x03C7;</italic>&#x00B2;<sub>36</sub>=14.333, <italic>P</italic>=.99) and white-box (<italic>&#x03C7;</italic>&#x00B2;<sub>45</sub>=14.000, <italic>P</italic>=.99) test sets.</p><p>In contrast, for N staging, statistically significant differences were observed in both the black-box (<italic>&#x03C7;</italic>&#x00B2;<sub>10</sub>=24.533, <italic>P</italic>=.006) and white-box (<italic>&#x03C7;</italic>&#x00B2;<sub>10</sub>=23.000, <italic>P</italic>=.01) test sets.</p><p>For M staging, the results differed between the test sets: a statistically significant difference was found in the white-box set (<italic>&#x03C7;</italic>&#x00B2;<sub>10</sub>=20.974, <italic>P</italic>=.02), but not in the black-box set (<italic>&#x03C7;</italic>&#x00B2;<sub>10</sub>=15.941, <italic>P</italic>=.10). It is worth noting that although the black-box model for M staging did not reach the statistical significance level, its chi-square test effect size reached <italic>&#x03C9;</italic>=0.399, which is considered a medium-to-large effect.</p></sec><sec id="s3-4-5"><title>Clinical Impact Assessment</title><p>Based on the previously defined error typology (<xref ref-type="table" rid="table4">Table 4</xref>), we systematically categorized all errors committed by both models in the white-box and black-box test sets. 
The detailed statistical results are presented in <xref ref-type="table" rid="table14">Table 14</xref> and visualized in <xref ref-type="fig" rid="figure21">Figure 21</xref>.</p><table-wrap id="t14" position="float"><label>Table 14.</label><caption><p>Distribution of error categories across primary tumor characteristics (T), regional lymph node involvement (N), and distant metastasis status (M) staging on White-box and Black-box of GLM-4-Air (general language model) and GPT-4o.</p></caption><table id="table14" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model/Data Set/Stage</td><td align="left" valign="bottom">I-Major errors</td><td align="left" valign="bottom">II-Moderate errors</td><td align="left" valign="bottom">III-Minor errors</td><td align="left" valign="bottom">Total</td></tr></thead><tbody><tr><td align="left" valign="top">GLM-4-Air-White-T</td><td align="left" valign="top">1</td><td align="left" valign="top">3</td><td align="left" valign="top">5</td><td align="left" valign="top">9</td></tr><tr><td align="left" valign="top">GPT-4o-White-T</td><td align="left" valign="top">6</td><td align="left" valign="top">4</td><td align="left" valign="top">4</td><td align="left" valign="top">14</td></tr><tr><td align="left" valign="top">GLM-4-Air-Black-T</td><td align="left" valign="top">0</td><td align="left" valign="top">6</td><td align="left" valign="top">2</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">GPT-4o-Black-T</td><td align="left" valign="top">4</td><td align="left" valign="top">5</td><td align="left" valign="top">4</td><td align="left" valign="top">13</td></tr><tr><td align="left" valign="top">GLM-4-Air-White-N</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">6</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">GPT-4o-White-N</td><td align="left" valign="top">4</td><td align="left" valign="top">4</td><td 
align="left" valign="top">12</td><td align="left" valign="top">20</td></tr><tr><td align="left" valign="top">GLM-4-Air-Black-N</td><td align="left" valign="top">6</td><td align="left" valign="top">0</td><td align="left" valign="top">8</td><td align="left" valign="top">14</td></tr><tr><td align="left" valign="top">GPT-4o-Black-N</td><td align="left" valign="top">7</td><td align="left" valign="top">2</td><td align="left" valign="top">21</td><td align="left" valign="top">30</td></tr><tr><td align="left" valign="top">GLM-4-Air-White-M</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">10</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">GPT-4o-White-M</td><td align="left" valign="top">8</td><td align="left" valign="top">0</td><td align="left" valign="top">22</td><td align="left" valign="top">30</td></tr><tr><td align="left" valign="top">GLM-4-Air-Black-M</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">8</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">GPT-4o-Black-M</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td><td align="left" valign="top">20</td><td align="left" valign="top">22</td></tr></tbody></table></table-wrap><fig position="float" id="figure21"><label>Figure 21.</label><caption><p>Distribution of error categories across T, N, and M staging on white-box and black-box of GLM-4-Air and GPT-4o. GLM: general language model; M: distant metastasis status; N: regional lymph node involvement; T: primary tumor characteristics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig21.png"/></fig><p>In the white-box evaluation, GLM-4-Air demonstrated superior performance with fewer total errors compared with GPT-4o across all staging categories. 
Specifically, for T staging, GLM-4-Air committed 9 errors versus GPT-4o&#x2019;s 14; for N staging, 8 errors versus 20; and for M staging, 10 errors versus 30.</p><p>This advantage remained consistent in the black-box evaluation. GLM-4-Air recorded 8, 14, and 8 errors in T, N, and M staging, respectively, while GPT-4o committed 13, 30, and 22 errors in the same categories.</p><p>Regarding error type classification, GLM-4-Air showed particularly notable performance in controlling category I errors. In T staging, GLM-4-Air committed only 1 category I error in white-box testing compared with GPT-4o&#x2019;s 6, and did not commit such errors at all in black-box testing versus GPT-4o&#x2019;s 4. For N staging, GLM-4-Air recorded 1 category I error in white-box (vs 4 for GPT-4o) and 6 in black-box (vs 7 for GPT-4o). In M staging, GLM-4-Air achieved 0 category I errors in both test sets, while GPT-4o committed 8 and 2 in white-box and black-box, respectively.</p><p>GLM-4-Air also demonstrated excellent performance in controlling category II errors. In T staging, it committed 3 category II errors in white-box (vs 4 for GPT-4o) and 6 in black-box (vs 5 for GPT-4o). For N staging, GLM-4-Air committed only 1 category II error in white-box (vs 4 for GPT-4o) and did not commit a category II error in black-box testing (vs 2 for GPT-4o). Neither model committed category II errors in M staging.</p><p>In category III errors, GLM-4-Air maintained an overall advantage. For T staging, it committed 5 category III errors in white-box (vs 4 for GPT-4o) and 2 in black-box (vs 4 for GPT-4o). In N staging, GLM-4-Air committed 6 category III errors in white-box (vs 12 for GPT-4o) and 8 in black-box (vs 21 for GPT-4o). 
For M staging, GLM-4-Air committed 10 category III errors in white-box (vs 22 for GPT-4o) and 8 in black-box (vs 20 for GPT-4o).</p><p>In addition, the evaluation results of strict and lenient interpretation strategies demonstrate that the risk of increasing false negatives under the strict strategy remains low. In the white-box test set, no false negative cases occurred, with the model correctly identifying all metastatic cases. In the black-box test set, only 1 false negative was observed, where an Mx case was misclassified as M0. Consequently, the false negative rate under the strict interpretation strategy was 0.5% (1/200), as visible in the confusion matrix in <xref ref-type="fig" rid="figure10">Figures 10</xref> and <xref ref-type="fig" rid="figure12">12</xref>.</p><p>On the other hand, the lenient strategy significantly increased false positives, with cases incorrectly classified as M1 or MX instead of M0. The results, detailed in <xref ref-type="fig" rid="figure22">Figures 22</xref> and <xref ref-type="fig" rid="figure23">23</xref>, show false positive rates of 30% (30/100) in the white-box set and 23% (23/100) in the black-box set.</p><fig position="float" id="figure22"><label>Figure 22.</label><caption><p>Confusion matrix for M staging using lenient strategy by GLM-4-Air in White-box test. GLM: general language model; M: distant metastasis status.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig22.png"/></fig><fig position="float" id="figure23"><label>Figure 23.</label><caption><p>Confusion matrix for M staging using lenient strategy by GLM-4-Air in Black-box test. 
GLM: general language model; M: distant metastasis status.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e77988_fig23.png"/></fig></sec><sec id="s3-4-6"><title>Cost-Effective Evaluation</title><p>The local hardware benchmark confirmed the operational efficiency of the model on accessible hardware (four NVIDIA GeForce RTX 4090 GPUs, 24 GB VRAM each). The median inference latencies per case for the GLM-4-Air model were 8.510 seconds for T staging, 5.125 seconds for N staging, and 1.300 seconds for M staging on the black-box dataset.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The principal finding of this study is that a mid-scale LLM, when optimized through a systematic framework of clinical prompt engineering and SFT, can be developed into a highly effective and more practical tool for this specific clinical task, demonstrating the efficacy and potential of domain-specific adaptation. Our optimized GLM-4-Air model achieved higher accuracy in NSCLC TNM staging relative to the general-purpose GPT-4o, alongside a notable reduction in clinically critical errors and practical deployability on consumer-grade hardware with latencies acceptable for clinical workflows. These performance outcomes, particularly evident in the semantically complex N and M staging tasks, suggest that our hybrid optimization strategy can effectively enhance domain-specific reasoning and guideline adherence required for clinical interpretation. Furthermore, these results provide evidence that a specialized, moderately sized model can achieve high accuracy in complex clinical tasks, offering a complementary pathway to the scale-driven paradigm and highlighting the critical role of domain-specific optimization.</p></sec><sec id="s4-2"><title>OCR and Annotation</title><p>One of the keys to our research was the collection of high-quality, real-world data. 
We therefore invested considerable effort in data preparation, working closely with medical experts for rigorous validation and cleaning, which provides high-quality source material for the OCR process. We believe OCR is a mature technology now, and this study relies fundamentally on the LLM&#x2019;s capacity for deep semantic understanding and information extraction from medical report text, not on exact string matching in a traditional way. Pretrained on massive text corpora, the LLM possesses inherent and robust capabilities for tolerating and correcting common OCR-induced noise by leveraging contextual information.</p><p>Our analysis also revealed that data quality, especially annotation quality, significantly impacts model performance, with even a small proportion of mislabeled samples potentially undermining the overall results. Given our limited dataset size, maintaining annotation quality became crucial. To ensure annotation quality and consistency, we developed standardized protocols aligned with AJCC TNM staging guidelines and implemented a multi-expert review mechanism. Annotators were instructed to make decisions strictly based on the medical reports&#x2019; textual content, minimizing subjective bias.</p><p>From the results of calculating and reporting the interannotator agreement using the Cohen kappa coefficient in <xref ref-type="table" rid="table5">Table 5</xref>, we observed an enhancement in T-stage agreement within the black-box set. This improvement may be attributed to the annotation process for the white-box set itself, possibly acting as a calibration phase. Through challenging cases, the annotators likely developed a more consistent internal standard for interpreting ambiguous T-stage descriptors (eg, invasion vs abutment). This refined, shared understanding was then applied during the black-box annotation, leading to higher immediate agreement. 
The sustained high agreement for N and M stages across both conditions reflects the more binary nature of nodal and metastatic assessment compared with the finer gradations required for T-stage determination. These findings underscore the context-dependent nature of staging reliability and highlight the value of multi-environment validation in assessing clinical annotation consistency.</p></sec><sec id="s4-3"><title>Prompt Optimization</title><p>Building upon the high-quality dataset, we realized three key strategies in prompt optimization were important to enhance the model&#x2019;s performance: (1) structured output and CoT design, (2) independent modular architecture design, and (3) background knowledge injection.</p><p>First, structured output and CoT design were adopted to improve the readability and interpretability of the model&#x2019;s output. In each staging task, the model not only provided the final conclusion but also presented the logical reasoning path. For example, in T staging, the model might state, &#x201C;The tumor diameter exceeds 7 cm and invades the chest wall structures, thus meeting the criteria for T4 staging.&#x201D; This approach ensures transparency, facilitating manual verification and error adjustment. Collaborating with medical professionals for white-box testing allowed us to analyze prediction errors and optimize prompts effectively.</p><p>Second, we implemented an independent modular architecture. We treated T (tumor size and location), N (lymph node metastasis), and M (distant metastasis) staging as separate tasks to reduce interference between staging judgments. To be specific, earlier versions had a 35% error rate in N staging due to mixed criteria. 
Modularization allowed the model to concentrate on specific tasks, increasing accuracy by approximately 30% and reducing cross-interference.</p><p>For example, in a case with the description &#x201C;T9 vertebra suspected metastatic bone tumor,&#x201D; the model initially misclassified it as M1a. After prompt optimization, it correctly identified it as T4 staging. This improvement was achieved through a specialized T staging prompt that detailed specific invasion criteria. The prompt listed T4 invasion sites, including diaphragm, mediastinum, heart, major vessels, trachea, recurrent laryngeal nerve, esophagus, vertebral bodies, and carina. This guided the model to correctly extract &#x201C;T4 Invasion Sites&#x201D;: &#x201C;vertebrae.&#x201D;</p><p>The model&#x2019;s reasoning demonstrated clear compliance with the instructions: &#x201C;The report indicates T9 vertebra involvement, which aligns with T4 staging criteria. While the tumor diameter is less than 50 mm and no T3 invasion sites are present, the vertebral involvement confirms T4 staging.&#x201D;</p><p>Finally, to address the issue of misjudgment due to the lack of professional medical knowledge in the model&#x2019;s practical application, we applied a Background Knowledge Injection strategy. This strategy involved incorporating common error patterns and detailed professional criteria from medical imaging analysis.</p><p>The background knowledge serves as a reference for each decision-making process. For instance, we observed that the model tended to incorrectly interpret small lesions in Chinese, such as cysts and thickening, as evidence of distant metastasis. 
This led to significant errors in M staging.</p><p>To address this, we made an additional injection of specific judgment criteria and medical terminology explanations, according to supplementary rules shown in <xref ref-type="table" rid="table1">Table 1</xref>, such as:</p><list list-type="bullet"><list-item><p>Knowledge 1: When expressions like &#x201C;small nodules,&#x201D; &#x201C;low-density nodule shadows,&#x201D; and &#x201C;density shadows&#x201D; appear, they should be considered cautiously and cannot be directly inferred as metastatic lesions.</p></list-item><list-item><p>Knowledge 2: Expressions like &#x201C;metastatic,&#x201D; &#x201C;consider metastasis,&#x201D; and &#x201C;suspected metastasis&#x201D; in other distant organs can be used as evidence for metastatic lesions.</p></list-item></list><p>Additionally, we clarified the criteria for key terms such as &#x201C;tumor invasion,&#x201D; &#x201C;cancerous nodules,&#x201D; and &#x201C;lymph node metastasis.&#x201D;</p><p>The integration of professional knowledge improved the model&#x2019;s ability to distinguish between minor abnormalities and actual metastatic lesions. As a result, the misjudgment rate for minor abnormalities decreased from 29% to 6%, while M staging accuracy increased from 70% to 90%.</p><p>The prompt optimization framework central to this study, which involves establishing a reasoning baseline, extracting key information, separately addressing T/N/M components, and injecting domain knowledge, was designed as a model-agnostic and universally applicable framework. To ensure a controlled and fair comparison, the iterative prompt optimization was informed by bad-case analysis from both models, and the final prompt set was applied uniformly to all evaluated models. 
This approach underscores that our primary objective was to validate a generalizable optimization pathway for the clinical task, not to orchestrate a maximally tailored benchmark for any specific model.</p><p>It should be acknowledged that the performance of any LLM, including generalist models, such as GPT-4o, could potentially be further enhanced with dedicated, model-specific prompt tuning. This is an inherent consideration when comparing a domain-optimized pipeline with a general-purpose model. Our findings and the proposed framework are positioned within the context of this shared, task-centric optimization objective.</p></sec><sec id="s4-4"><title>SFT</title><p>After the prompt optimization process, white-box testing results indicated that the model still faces challenges in precise numerical calculations and prompt adherence. Especially for precise numerical comparisons, it is difficult for further prompt optimization to solve this problem due to the lack of related abilities in the original model itself. Therefore, we introduced SFT into the training framework to further enhance model capabilities.</p><p>The SFT results showed significant improvements in domain-specific tasks, especially in tumor size measurements and classification. This demonstrated that SFT was an effective strategy for enhancing the model&#x2019;s numerical computation abilities. Interestingly, we also observed significant differences in the fine-tuning results of different staging models: T staging accuracy improved markedly, while N and M staging did not. The results for M staging (<xref ref-type="fig" rid="figure7">Figure 7</xref>) show that the fine-tuned models achieved accuracies comparable to the baseline, with the highest-performing configuration matching the baseline accuracy of 90%, while others were marginally lower. 
We first clarify that, based on experimental review, neither label noise in the M-staging training data nor the learning rate configuration was the primary cause of this performance plateau. We attribute the observed results primarily to the inherent characteristics of the task and the interplay within our multi-task SFT framework.</p><p>First, N and M staging tasks have fewer subcategories compared with T staging. The baseline model, powered by extensive pretraining, may already be near a performance ceiling for tasks heavily reliant on key information retrieval. Consequently, the baseline accuracies for N and M staging were high (89% and 90%, respectively), leaving limited absolute room for improvement via SFT on our dataset. This is particularly relevant for M staging, a binary task (M0 vs M1) with relatively definitive criteria that depend on detecting explicit descriptions of distant metastasis.</p><p>Building on this, a more granular technical analysis reveals underlying mechanisms. Beyond the task&#x2019;s retrieval-intensive nature, the limited number of M1-positive cases in our dataset may constrain the model&#x2019;s ability to learn the subtle semantic distinctions required (eg, between M1a &#x201C;contralateral lung nodules&#x201D; and benign findings). More importantly, within our single-stage, multi-task (T, N, M) joint SFT framework, we hypothesize that conflicting optimization signals may have arisen. T and N staging, as complex tasks demanding deep semantic reorganization, likely generated stronger gradients that dominated the model&#x2019;s optimization trajectory. In contrast, M staging performance relies heavily on the precise pattern-matching capabilities inherent in the base model. The gradient updates aimed at learning complex semantic mappings for T/N tasks may have inadvertently perturbed these foundational retrieval mechanisms. 
When the training signal for the M task is relatively limited, this &#x201C;interference cost&#x201D; could offset any task-specific gains, resulting in a neutral net performance benefit. It is important to note that we present this &#x201C;interference cost&#x201D; as a theoretical hypothesis for the observed results, and it remains to be validated through controlled single-task experiments.</p><p>Therefore, the observed plateau in M staging primarily reflects a misalignment between the task&#x2019;s needs and the full-parameter SFT approach in a multi-task setting, rather than a model flaw. This analysis directly corroborates the rationale for our final hybrid strategy. For M staging, preserving the base model&#x2019;s strong inherent retrieval ability and guiding it precisely via prompt engineering proved to be a more robust and parameter-efficient solution than full semantic remapping via SFT. Conversely, the significant accuracy gains in T and N staging validate that SFT is highly effective for tasks that require the deep semantic comprehension and reasoning it is designed to enhance.</p></sec><sec id="s4-5"><title>Model Evaluation</title><sec id="s4-5-1"><title>Performance Metrics</title><p>We noticed the minor 3-percentage-point drop from white-box to black-box testing. Since we have introduced approximately 7 times the amount of heterogeneous data compared with the target-domain data and balanced the data distribution to ensure essentially consistent coverage and proportions of I to IV clinical stages across both test sets, we believe that the minor difference between the 93% accuracy in white-box testing and 90% accuracy in black-box testing is attributable to normal fluctuations resulting from the test dataset, rather than significant overfitting.</p><p>The analysis of 95% CIs for accuracy reveals several noteworthy findings. 
Following SFT, T staging demonstrated not only substantially improved accuracy but also a markedly narrowed 95% CI, indicating enhanced estimation precision and significantly improved model stability for this task. A similar pattern was observed for N staging, where the model maintained high accuracy while achieving a more stable and narrower CI after SFT. In contrast, GPT-4o exhibited comparatively wider CIs across evaluations, suggesting greater uncertainty in its predictions. Overall, the post-SFT model shows consistently narrowed CIs for both T and N staging, reflecting improved estimation precision and reduced overfitting risk. Notably, in all six comparative scenarios between GLM-4-Air and GPT-4o (encompassing both white-box and black-box evaluations across TNM components), GLM-4-Air consistently demonstrated superior 95% CIs. This pattern suggests improved estimation precision and model stability for these tasks within the evaluated data distribution, which is a desirable property for clinical applications. Particular attention should be paid to the width of the CI for N staging accuracy in the black-box set (77.9% to 91.5%). In comparison, the 95% CIs for both T staging and M staging accuracy in the same black-box set are notably narrower (85.0% to 95.9%). This contrast highlights the greater precision and stability achieved in T and M staging, while the width of the N staging CI reflects not only statistical uncertainty from the sample size but also, and perhaps more critically, the inherent diagnostic challenges specific to imaging-based nodal (N) staging. The assessment of lymph nodes often occupies a clinical &#x201C;gray zone&#x201D; between suggestive radiological findings and definitive pathological confirmation, which is a fundamental source of label variability. 
Additionally, the heterogeneity in nonstandardized descriptive language across different institutional reports particularly affects N staging, contributing to the observed performance range. Therefore, while the relative narrowing of the CI post-SFT confirms improved model stability, its absolute width may, in part, map the expected performance variability when the model confronts the genuine and pronounced complexity inherent to clinical N staging. This understanding underscores that the observed CI width for N staging is not solely a statistical phenomenon but also a reflection of the specific real-world complexity of that task. Consequently, the width of the CI for N staging appropriately highlights the need for future validation in larger, prospective cohorts to more precisely estimate its performance in broader populations.</p><p>Based on the analysis of Precision, Recall, and <italic>F</italic><sub>1</sub>-score for the TNM staging results, we found that for T staging, the assessment of T1a, T1b, T1c, T2a, T2b subcategories primarily relies on the accurate extraction of tumor size information from medical reports, and precise numerical comparison and classification against established staging criteria. The calculated results indicate no substantial differences between the two models in T1 (T1a, T1b, and T1c) and T2 (T2a and T2b) staging outcomes, suggesting comparable capabilities in these fundamental tasks. In T4 staging, which entails assessing involvement of critical structures such as the diaphragm, mediastinum, heart, great vessels, trachea, recurrent laryngeal nerve, esophagus, vertebral body, or carina, GLM-4-Air achieved significantly higher recall than GPT-4o in both black-box (95.6% vs 82.6%) and white-box (92.0% vs 84.0%) settings. This indicates GLM-4-Air&#x2019;s superior ability to recognize complex or implicit descriptions of invasion. 
Although GPT-4o showed a slight advantage in black-box precision (95.0% vs 91.7%), its white-box precision dropped sharply to 77.8%, whereas GLM-4-Air maintained 92.0%, demonstrating stronger consistency. For T3 staging, which involves judging chest wall or pericardial invasion, GPT-4o showed marginally higher recall than GLM-4-Air in the black-box set (86.7% vs 80.0%), potentially due to its more accurate and stable interpretation of relevant terminology across diverse medical reports. However, it is noteworthy that GPT-4o&#x2019;s precision in the white-box set (71.4%) was considerably lower than that of GLM-4-Air (83.3%). Based on a comprehensive analysis and supported by confusion matrix data, we observed that GPT-4o frequently misidentified invasion sites, for instance, misclassifying diaphragmatic invasion as T3-stage involvement. In contrast, GLM-4-Air demonstrated a more balanced judgment, highlighting its advantage in generating reliable clinical references. We also observed that in the white-box test, GLM-4-Air exhibited lower precision in T1a and lower precision and recall in T1c compared with GPT-4o. Although the situation improved in the black-box test, precision in T1c remained lower. Error analysis revealed that most misclassifications by GLM-4-Air involved incorrectly assigning Tx cases as T1a or T1c, often due to errors in extracting tumor diameter from multiple posttreatment measurements. GPT-4o appeared more accurate in such contexts. In response to this identified limitation, we have supplemented the staging rule to specify that tumor diameter should be based on the most recent measurement obtained following the current treatment cycle (<xref ref-type="table" rid="table1">Table 1</xref>). GLM-4-Air achieved a higher macro-average <italic>F</italic><sub>1</sub>-score than GPT-4o (black-box: 0.914 vs 0.836; white-box: 0.887 vs 0.839). 
This superior performance indicates a better balance between precision and recall, which effectively reduces both misclassification and underdiagnosis. This advantage is particularly evident in T4 staging. In the context of NSCLC T staging, this implies that GLM-4-Air provides more comprehensive and reliable identification of various tumor invasion patterns, thereby enhancing diagnostic accuracy. Finally, comparing GLM-4-Air&#x2019;s overall performance in T staging between white-box and black-box tests revealed no noticeable decline in metrics, indicating stable model output and an absence of overfitting.</p><p>For N staging, in the N0 staging category, GLM-4-Air demonstrated outstanding performance: achieving a recall of 95.6% in white-box testing (vs GPT-4o&#x2019;s 66.7%) and 97.7% in black-box testing (vs GPT-4o&#x2019;s 58.1%). This advantage primarily stems from the instructional fine-tuning that incorporates precise medical knowledge: the model strictly adheres to definitive terminology such as &#x201C;lymph node metastasis&#x201D; or &#x201C;enlarged lymph node shadow&#x201D; to determine metastasis, while maintaining caution against overinterpreting nonspecific descriptions like &#x201C;small nodule&#x201D; or &#x201C;mild FDG uptake.&#x201D; In contrast, GPT-4o, relying solely on prompt engineering without strict medical logic enforcement, generated numerous false positives (33 cases), substantially reducing its recall. Accurate discrimination between N0 and N1-N3 stages holds direct clinical significance for treatment decisions, such as determining the need for regional lymph node radiotherapy or extensive lymphadenectomy (<xref ref-type="table" rid="table4">Table 4</xref> provides error categories). However, in black-box testing, recall for N3 remained suboptimal: GLM-4-Air at 55.6% and GPT-4o at 44.4%. The confusion matrices indicate that both models frequently misclassified N3 as N2 (6 cases by GLM-4-Air, 8 by GPT-4o). 
This stems from the highly variable and nuanced anatomical descriptions in reports (eg, &#x201C;contralateral mediastinum&#x201D; and &#x201C;contralateral hilum&#x201D;), which current LLMs struggle to interpret accurately. Future work may involve targeted training or rule-based enhancements to improve performance in such judgments. Additionally, GLM-4-Air misclassified 2 N3 cases as N0 due to its strict adherence to the principle of &#x201C;classifying as N0 in the absence of definitive metastatic evidence.&#x201D; While this leads to occasional under-classification, the trade-off is justified given the substantial improvement in N0 recall. Subsequent SFT focusing on complex anatomical descriptions may further enhance N3 staging accuracy. For the Nx staging category, which reflects clinically indeterminate cases, both models showed limited recall: GLM-4-Air and GPT-4o achieved 0% and 8.3%, respectively, in white-box testing, and 50.0% and 13.3% in black-box testing. Analysis of GPT-4o&#x2019;s outputs suggests a tendency toward conservative assessments: it classified 10 cases that should have been &#x201C;no metastasis&#x201D; as &#x201C;indeterminate (Nx)&#x201D; in both test sets, significantly lowering its Nx recall. In comparison, GLM-4-Air demonstrated a stronger inclination toward definitive classification and rarely output &#x201C;indeterminate&#x201D; or equivalent responses. Finally, in N staging overall, GLM-4-Air exhibited no noticeable performance drop from white-box to black-box testing. Moreover, its macro-average precision, recall, and <italic>F</italic><sub>1</sub>-score all improved in the black-box setting, indicating stable model generalization without signs of overfitting.</p><p>For M staging, based on the <italic>F</italic><sub>1</sub>-score, GLM-4-Air outperformed GPT-4o in both white-box and black-box evaluations. 
We attribute this advantage primarily to the effective integration of medical knowledge related to distant tumor metastasis through carefully designed prompts. The results indicate that the medical knowledge acquired during the model&#x2019;s pretraining phase is insufficient for accurate distant metastasis assessment in our specific clinical context, necessitating supplementary knowledge injection coupled with robust instruction-following capabilities. It is noteworthy that no fine-tuning was performed for GLM-4-Air on M staging tasks, as it already demonstrated competent medical instruction execution. Conversely, should a model exhibit inadequate instruction adherence with complex prompts, instructional fine-tuning could be considered. Overall, GLM-4-Air demonstrated superior efficiency in leveraging medical knowledge embedded in prompts. Analysis of confusion matrices revealed that GPT-4o&#x2019;s failure to strictly follow prompt instructions resulted in 38 cases of misclassification where nonmetastatic cases were incorrectly identified as metastatic or suspected metastatic. In M staging, GLM-4-Air rigorously adhered to our prompt strategy, which inevitably led to lower performance metrics in certain subcategories compared with GPT-4o. For instance, in white-box M1c staging, GLM-4-Air&#x2019;s recall (92.6%) was slightly lower than GPT-4o&#x2019;s (96.3%). Error analysis revealed that GLM-4-Air&#x2019;s misclassifications stemmed from strict compliance with the prompt rule requiring explicit metastatic evidence in reports, leading to misclassification of M1c as M1a. Under identical prompts, GPT-4o did not adhere to this rule. According to our error classification criteria (<xref ref-type="table" rid="table4">Table 4</xref>), misclassification between M1c and M1a does not constitute a severe category I error. While strict, detailed prompt strategies may reduce recall in specific scenarios, they ensure better precision. 
We maintain that a relatively balanced staging performance is more likely to be applicable in real-world clinical settings. Finally, a comprehensive evaluation of GLM-4-Air&#x2019;s performance in M staging across white-box and black-box tests showed no significant performance degradation. Moreover, macro-average precision, recall, and <italic>F</italic><sub>1</sub>-score all improved in the black-box setting, indicating stable model output without evidence of overfitting.</p><p>Upon analysis of the model&#x2019;s performance data from both pre- and post-SFT stages, it was observed that in T staging, substantial improvements in <italic>F</italic><sub>1</sub>-score were observed across the T1a, T1b, T1c, T2a, and T2b subcategories following SFT. This indicates enhanced model capability in numerical discrimination tasks based primarily on tumor size, reflecting SFT&#x2019;s significant effect in strengthening both fundamental data computation and rule-following abilities. For T2, T3, and T4 staging, where assessment depends not only on tumor dimensions but also on medical understanding of local invasion extent and involved anatomical structures, <italic>F</italic><sub>1</sub>-scores increased by 57.1%, 41.6%, and 12.0%, respectively, after SFT. These improvements demonstrate that SFT effectively infused relevant medical knowledge, enhancing the model&#x2019;s comprehension of complex clinical terminology and real-world medical scenarios. Notably, a slight decrease in precision was observed for T2a post-SFT, primarily due to misclassification of some T4 cases as T2a. However, this was accompanied by an 18.7% improvement in T4 precision, which is an outcome aligned with our objective of minimizing category I errors.</p><p>In N staging, post-SFT <italic>F</italic><sub>1</sub>-score improvements for N1, N2, and N3 were 8.9% (0.800 to 0.889), 7.3% (0.882 to 0.955), and 3.0% (0.867 to 0.897), respectively. 
Although these gains appear modest, they remain practically meaningful given the model&#x2019;s already high baseline performance in these categories, confirming that SFT effectively enhanced lymph node metastasis assessment through improved medical knowledge integration. For the N0 subcategory, precision slightly decreased (&#x2212;4.2%) after SFT, but recall increased from 93.3% to 95.6% (+2.3%), with the <italic>F</italic><sub>1</sub>-score decreasing only marginally (0.009). This shift resulted from the model&#x2019;s stricter adherence to clinical interpretation strategies post-SFT: classifying cases as positive only when definitive terminology such as &#x201C;lymph node metastasis&#x201D; or &#x201C;enlarged lymph node shadow&#x201D; was present, while maintaining caution against nonspecific descriptions like &#x201C;small nodule&#x201D; or &#x201C;mild FDG uptake,&#x201D; thereby effectively reducing false positives. Overall, the minimal <italic>F</italic><sub>1</sub>-score reduction in N0 is substantially outweighed by the collective gains in N1-N3, consistent with our expectations.</p><p>In conclusion, SFT of GLM-4-Air not only improved computational accuracy in rule-driven tasks but also significantly enhanced discriminative capability in complex medical reasoning scenarios.</p></sec><sec id="s4-5-2"><title>Statistical Comparison</title><p>In the TNM staging task, model performance exhibited distinct stage-dependent variations between white-box and black-box evaluations. For T staging, no statistically significant differences were observed between models in either test set (<italic>P</italic>&#x003E;.05). This likely stems from T staging&#x2019;s reliance on objective, quantifiable imaging features such as tumor size, which is a task with well-defined criteria and high interpretive consistency. 
The observed effect sizes for white-box (<italic>&#x03C9;</italic>=0.374) and black-box (<italic>&#x03C9;</italic>=0.378) evaluations both reached medium-to-large magnitudes according to conventional benchmarks. Although these effect sizes should be interpreted with caution due to the nonsignificant differences, they may nonetheless indicate exploratory trends worthy of future investigation with larger sample sizes.</p><p>In contrast, for N staging, statistically significant differences (<italic>P</italic>&#x003C;.05) were consistently demonstrated across both test sets, indicating statistically meaningful performance disparities. This suggests fundamental differences in how the models interpret regional lymph node metastasis status.</p><p>For M staging, white-box evaluation showed significant differences (<italic>P</italic>&#x003C;.05). In the black-box evaluation, where the difference did not reach statistical significance (<italic>P</italic>=.10), the medium-to-large effect size (Cohen <italic>&#x03C9;</italic>=0.399) suggests a potential trend, which points to a possible performance difference in detecting distant metastasis evidence in model behavior.</p><p>In summary, model performance divergence primarily emerged in tasks requiring complex clinical reasoning (eg, lymph node or distant metastasis assessment), whereas performance converged in tasks dependent on explicit numerical criteria (eg, T staging).</p></sec><sec id="s4-5-3"><title>Clinical Impact Assessment</title><sec id="s4-5-3-1"><title>Overview</title><p>Based on the comprehensive analysis of all error types for both models across white-box and black-box evaluations, GLM-4-Air demonstrates substantial advantages in the TNM staging task, which can be summarized as the following 2 aspects.</p></sec><sec id="s4-5-3-2"><title>GLM-4-Air's Advantages in TNM Staging</title><sec id="s4-5-3-2-1"><title>Superior Overall Error Control</title><p>Across all 18 comparison metrics (3 stages &#x00D7; 2 test sets &#x00D7; 3 error categories), GLM-4-Air 
outperformed GPT-4o in 14 metrics, matched its performance in 2 metrics (category II errors in M staging, with 0 errors in white-box and black-box for both models), and showed marginally higher errors in only 2 metrics (category II errors in black-box T staging and category III errors in white-box T staging, with 1 additional error each). This overall advantage primarily stems from the heterogeneous data training strategy used during SFT for T and N staging, coupled with enhanced instruction adherence (strict implementation of our stringent strategy), which collectively improved the model&#x2019;s discriminative capability for medical features.</p></sec><sec id="s4-5-3-2-2"><title>Significant Effectiveness in Controlling Critical Error Types</title><p>The core strategy of this study focuses on effectively reducing category I errors. GLM-4-Air completely reduced category I errors to 0 in M staging, whereas GPT-4o committed 8 and 2 such errors in white-box and black-box tests, respectively. This discrepancy carries crucial clinical implications: misclassifying M0 (no distant metastasis) as M1 (distant metastasis present) would incorrectly upgrade the clinical stage from III to IV, consequently shifting the treatment strategy from surgery-based comprehensive therapy to primarily palliative care. Similarly, in N staging, GLM-4-Air significantly reduced category I errors involving the misclassification of N0 as N2. Such errors could lead to overdiagnosis in clinical practice, subjecting patients who do not require lymph node dissection or radiotherapy to unnecessary medical risks and financial burdens. Despite the overall superior performance, minor fluctuations were observed in GLM-4-Air&#x2019;s category II errors in black-box testing and category III errors in white-box testing. 
Analysis of these specific error cases revealed that most stemmed from inaccuracies in standardized comparison of tumor size data, which is a challenge also observed in GPT-4o&#x2019;s errors. Although the performance gap with GPT-4o in these aspects is minimal, we will continue to explore methods to further enhance these capabilities.</p></sec><sec id="s4-5-3-2-3"><title>Conclusions</title><p>In conclusion, while maintaining stringent control over category I errors, GLM-4-Air also preserves advantages over GPT-4o in most category II and III error metrics. This indicates that the model achieves high safety standards without compromising overall accuracy, rendering GLM-4-Air particularly suitable for practical TNM staging applications with enhanced reliability and performance.</p></sec></sec></sec></sec><sec id="s4-6"><title>Rationale for Strict M-Staging Strategy</title><p>In addition, based on the comparative analysis of strict versus lenient interpretation strategies for M staging, the strict strategy proved highly effective in minimizing false positives while not significantly increasing false negatives. In contrast, the lenient strategy resulted in a substantially higher rate of false positives.</p><p>The adoption of the strict interpretation strategy for M staging aims to minimize false positives without compromising the need to maintain a low rate of false negatives and is grounded in specific clinical imperatives. From a clinical perspective, the high false positive rate observed under the lenient strategy is unacceptable. For instance, benign findings commonly described in reports, such as cysts in the kidneys or liver in older patients, could be misclassified as M1 (Stage IV) metastases under lenient rules. This could immediately induce unnecessary patient anxiety and trigger a cascade of inappropriate clinical actions. 
More critically, a false-positive call at this diagnostic juncture can fundamentally misdirect the primary treatment pathway, potentially steering a patient eligible for curative-intent therapy (eg, surgery or definitive chemoradiation) toward an erroneous palliative systemic approach. Furthermore, it compels a series of costly, invasive, and burdensome confirmatory investigations (eg, advanced imaging, biopsies) without clinical merit.</p><p>This strategic choice is also aligned with the linguistic reality of Chinese radiology reports, which often use broad or cautious terminology. The strict interpretation acts as a necessary filter against this inherent ambiguity, serving as a &#x201C;localized calibration&#x201D; technique in this context. Its core function is to correct the tendency of general-purpose LLMs to over-interpret ambiguous expressions in the absence of domain-specific fine-tuning, an enhancement generally applicable within Chinese clinical practice. It is crucial to acknowledge that this calibrated approach, while effective, is not a universal reasoning logic. It could theoretically introduce a risk of false negatives in scenarios where conservatively phrased reports use descriptive terminology. This technique complements the AJCC-based rule supplements we implemented.</p><p>Therefore, while sensitivity remains important, our strategy is designed to ensure that the model&#x2019;s outputs provide highly credible and actionable decision support. A crucial caveat is that using such a strategy requires preliminary testing and calibration based on the target language or dataset. The decision to apply a strict strategy (to correct LLM over-interpretation) versus a lenient one (to correct LLM under-interpretation) should be guided by initial validation, as the latter scenario is also possible in other linguistic contexts or specific datasets. 
The strict strategy for M staging in this study thus represents a calibrated balance, optimizing the model for reliable integration into high-stakes clinical workflows.</p></sec><sec id="s4-7"><title>Cost-Effective Evaluation</title><p>This evaluation examines the operational efficiency and deployment feasibility of our optimized GLM-4-Air model. Through domain-specific optimization, the 32B-parameter GLM-4-Air model can achieve high staging accuracy while demonstrating a practical efficiency profile. A key practical implication of this study is the demonstration that a specialized, moderately sized language model can perform complex TNM staging with high accuracy using cost-effective, consumer-grade hardware. Specifically, our local deployment benchmark achieved low per-component latencies on 4 RTX 4090 GPUs, confirming that the model&#x2019;s performance profile is acceptable and suitable for integration into real-world clinical workflows without requiring prohibitive computational infrastructure. While this study demonstrates the cost-effectiveness of the 32B-parameter GLM-4-Air model within our hybrid framework, we acknowledge that the broader claim of hardware accessibility is not exclusive to this specific model. Other competitively performant &#x201C;lightweight&#x201D; models (eg, GPT-4o-mini, Llama-3-70B), which share a similar order-of-magnitude parameter scale, could potentially offer comparable reductions in computational resource demand and deployment cost. While the assessment of other lightweight models was beyond the scope of this study, a systematic comparison among them represents a direction for future research.</p><p>Furthermore, the Cost-effective Evaluation suggests a trend of complementary strengths between large general-purpose models and smaller, domain-adapted models in the medical TNM staging task. On one hand, GPT-4o, a general-purpose LLM, demonstrates high conciseness in tasks with shorter reasoning chains, such as N staging. 
Its powerful linguistic generalization capabilities enable it to rapidly extract key information. On the other hand, for complex tasks like T staging, which require in-depth parsing of anatomical relationships and multimodal clinical descriptions, GLM-4-Air, enhanced by targeted medical SFT, establishes more stable and interpretable reasoning pathways. While large-parameter models excel in cross-domain generalization and flexible response for open scenarios, smaller-parameter models like GLM-4-Air are potentially better suited for high-reliability specialized tasks. They can potentially deliver lower overall latency and higher deployment cost-effectiveness while ensuring reasoning accuracy and completeness, with a proper training framework.</p></sec><sec id="s4-8"><title>Limitations</title><p>While the performance of our model is robust on the held-out test sets, we acknowledge that the generalizability of any AI model trained on a dataset of this scale (292 fine-tuning cases) warrants careful discussion. The primary consideration is its performance on data from wider distributions, such as more institutions with variations in reporting styles. Furthermore, our dataset was sourced exclusively from the proprietary medical record-management platform reliant on user-initiated uploads. This data acquisition method may introduce potential selection biases: the user population likely possesses higher digital literacy and health awareness, potentially limiting representativeness in terms of age or socioeconomic status; the act of voluntary uploading may also over-represent patients with more complex conditions, unresolved diagnostic concerns, a willingness to seek second opinions, or experiences with multi-institutional care in the dataset. 
These specific characteristics suggest that the patient profile in our study may differ from that of a general hospital cohort, which would typically include a higher proportion of patients with lower digital literacy (often associated with advanced age or lower socioeconomic status) and earlier-stage disease. The model&#x2019;s staging accuracy for these specific subgroups requires further validation. Additionally, as uploaded data typically consists of single or nonconsecutive medical records, it lacks the temporal continuity and completeness in documenting disease progression compared with the structured, longitudinal records within a hospital information system. This difference in information presentation implies that the model may face challenges in inference and integration when processing richer contextual reports. Consequently, while the model demonstrates robust performance within the represented data distribution, its generalizability to passively collected, consecutive hospital-wide cohorts could be further explored in future work. Caution is advised when interpreting performance metrics for subclasses with a low number of cases. The high recall rates observed in some specific subgroups primarily reflect performance on that particular test subset and are subject to statistical variability due to limited sample size. It is also crucial to emphasize that an awareness of these potential biases directly informed our study design. To proactively enhance model robustness and generalizability within this context, we integrated 2 core methodological choices: the curation of rigorously balanced test sets (<xref ref-type="fig" rid="figure2">Figure 2</xref>) to ensure reliable evaluation, and the use of a heterogeneous data regimen during SFT to significantly reduce the risk of overfitting.</p><p>Another topic we would like to discuss is language generalizability. 
In this study, the model was exclusively trained and validated on medical reports in Chinese, and we acknowledge its potential limitations in generalizing across languages and health care systems. The unique expression logic, terminology, and writing structure inherent to Chinese clinical texts may restrict the model&#x2019;s direct applicability to other languages, such as English, or different medical environments. A noteworthy point for discussion is the systematic difference in linguistic precision between Chinese and English medical language. Chinese reports often use broad-meaning or imprecise terms; for instance, the word &#x201C;&#x4FB5;&#x72AF;&#x201D; can encompass various pathological states from adhesion to infiltration, with its specific interpretation highly dependent on context and the radiologist&#x2019;s personal expression habits. In contrast, the mature system of English medical literature, shaped by long-term standardization, tends to use more discriminative terminology, such as &#x201C;abutment,&#x201D; &#x201C;invasion,&#x201D; and &#x201C;encasement&#x201D; to describe different degrees of involvement. These terms often have more direct correspondence with definitions in the AJCC staging guidelines. This linguistic characteristic suggests that while our current model is specialized for the Chinese context, adapting its core competency to the potentially more standardized terminology of English environments might present relatively manageable semantic understanding challenges, and the model may need recalibration for health care systems with more precise reporting standards.</p></sec><sec id="s4-9"><title>Future Work</title><p>Building upon the generalizable foundation established in this work, our future research will focus on extending the model&#x2019;s robustness to multi-institutional environments. 
The strategies outlined below are presented as logical and powerful extensions of this study:</p><list list-type="order"><list-item><p>More collaboration: To directly assess and enhance performance across varied clinical settings, we plan to establish more collaborations to build larger, more representative datasets spanning different languages and health care systems. A key component will be the development of unified data inclusion standards and quality control metrics. We will continuously assess the collaboration&#x2019;s effectiveness and optimize the workflow and technical protocols to ensure the creation of high-quality, federated data resources.</p></list-item><list-item><p>Systematic, knowledge-guided data augmentation: We will develop a systematic data augmentation pipeline to explicitly train the model to increase invariance to clinical language variations. This will include:</p><list list-type="order"><list-item><p>Guideline-based semantic paraphrasing: Leveraging authoritative resources like the AJCC Cancer Staging Manual, we will systematically create a thesaurus of medical synonyms and sentence templates. For instance, for &#x2018;tumor invasion of the visceral pleura,&#x2019; we will generate medically accurate alternatives like &#x2018;microscopic involvement of the visceral pleura&#x2019; or &#x2018;lesion involving the visceral pleural surface.&#x2019; This directly enhances robustness to variations in physicians&#x2019; dictation styles.</p></list-item><list-item><p>LLM-assisted synthetic data generation: Using high-quality reports as seeds, we will use LLMs under strict constraints for conditional text generation. This will efficiently produce a large volume of coherent and logically sound synthetic reports. 
This approach will be particularly targeted toward under-represented or easily confused staging categories (eg, T1c vs T2a) identified in our error analysis, allowing us to strategically address specific model weaknesses rather than blindly expanding data.</p></list-item></list></list-item></list><p>In terms of language generalizability, to systematically enable effective deployment in broader scenarios, we propose the following 3 actionable technical pathways:</p><list list-type="order"><list-item><p>International collaboration: Establish collaborations with overseas medical institutions to acquire ethically approved, deidentified medical reports in English (or other languages). This will build a multi-center, cross-lingual dataset for model adaptation training and external validation, enhancing its ability to extract key information from reports with different structures and terminology.</p></list-item><list-item><p>Optimized modular and decoupled design: Decouple the model&#x2019;s language understanding component from its medical reasoning component. The language understanding module would focus on accurately extracting key medical entities and their attributes from raw text. The medical reasoning module, leveraging shared clinical knowledge, would then perform logical staging based on these structured findings, strictly adhering to TNM guidelines. This design ensures the reasoning component&#x2019;s generalizability across languages. 
When adapting to a new language, only the entity recognition capability of the language module needs training, significantly improving cross-lingual adaptability and reducing data requirements.</p></list-item><list-item><p>Construction of a cross-lingual clinical terminology mapping atlas: Systematically organize key phrases and imprecise terms related to TNM staging in specific language reports, establishing precise mappings to international standard terminologies like SNOMED CT (Systematized Nomenclature of Medicine - Clinical Terms) or Unified Medical Language System. This work will lay the foundation for achieving semantic alignment of clinical texts across languages.</p></list-item></list><p>In summary, by implementing this technical roadmap centered on global collaboration, modular design, and cross-lingual terminology alignment, we can systematically enhance the model&#x2019;s generalizability, ultimately developing it into a practical tool for global, multi-lingual clinical environments.</p><p>Given the inherent complexity and subjective nature of cancer staging, we recognize that a prospective comparison between the model&#x2019;s performance and the independent interpretations of multiple clinicians on new, unseen cases would provide a more ecologically valid assessment of its clinical utility. Such a study design, where the model&#x2019;s outputs and individual expert annotations are collected in parallel without a pre-established consensus, would more accurately simulate real-world deployment scenarios. This approach would not only allow for a robust benchmarking against the natural variation inherent in clinical practice but also help identify specific contexts where the model&#x2019;s reasoning aligns with or diverges from human experts. We plan to implement this in future research, potentially involving a larger and more diverse panel of annotators from multiple institutions. 
This direction is particularly significant for validating AI assistants in complex, subjective tasks like cancer staging, and its methodology could be extended to other high-stakes clinical decision support applications where interexpert variability is a key consideration.</p></sec><sec id="s4-10"><title>Conclusions</title><p>In conclusion, this study establishes a robust and efficient framework for automating TNM staging in NSCLC by leveraging the GLM-4-Air model enhanced through advanced prompt engineering and selectively applying SFT to reasoning-heavy tasks (T/N) while leveraging the baseline model for retrieval-centric tasks (M). This task-dependent optimization strategy highlights that the value of fine-tuning is not universal but varies with the nature of the clinical subtask, which is a key insight for future clinical AI development. The finalized model demonstrated superior performance over a leading commercial model, GPT-4o, achieving higher accuracy, particularly in complex staging decisions, while simultaneously reducing the incidence of clinically critical errors. Its design, which emphasizes reasoning transparency and adherence to clinical guidelines, ensures reliability. 
The exceptional cost efficiency of the solution further underscores its viability for scalable deployment.</p><p>This work also provides a validated pathway toward standardizing and augmenting cancer staging processes, with the immediate potential to improve consistency in treatment planning and to expand access to expert-level staging support in diverse health care settings.</p></sec></sec></body><back><ack><p>We are grateful to Ms Meirong Shi, Ms Yuhuai Li, Mr Min Liu, Mr Zhiqiang Guo, and Mr Jiong Shen for their support for this study.</p></ack><notes><sec><title>Funding</title><p>No external financial support or grants were received from any public, commercial, or not-for-profit entities for the research, authorship, or publication of this study.</p></sec><sec><title>Data Availability</title><p>The original deidentified clinical text dataset generated and analyzed during this study is not publicly available to protect patient privacy and in accordance with ethical approvals. However, a fully anonymized dataset of both the training and testing sets is available from the corresponding author upon reasonable request, subject to a Data Use Agreement. At this stage, a redacted example dataset comprising 5 deidentified cases is available in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. The optimized prompts used for the large language models in this study are provided in the <xref ref-type="supplementary-material" rid="app6">Multimedia Appendices 6</xref><xref ref-type="supplementary-material" rid="app7"/>-<xref ref-type="supplementary-material" rid="app8">8</xref>. The proprietary large language model (GLM-4-Air) used and fine-tuned in this study is a commercially licensed product, and its weights are not publicly distributable due to intellectual property restrictions. 
However, the complete training configuration, including hyperparameters, low-rank adaptation settings, and the fine-tuning pipeline, is described in the Methods section and <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>. This ensures that the methodology can be fully replicated using comparable open-source models within the same framework.</p></sec></notes><fn-group><fn fn-type="con"><p>C Li and JW contributed to the conceptualization of the study. RJ was responsible for data curation and, together with C Ling, conducted the investigation. Formal analysis was performed by RJ, C Ling, and YH. The methodology was developed by C Li and C Ling. Project administration was managed by JW and C Li, while resources were provided by JW, NL, JH, and JS. Software development was carried out by C Ling, XR, CC, and SZ. Supervision was provided by C Li. Validation was conducted by RJ, YH, QW, and YL, and visualization was completed by YH and YS. The original draft was written by C Li and YS, and C Li, YH, NL, JH, and JS contributed to the review and editing of the manuscript. 
All authors contributed to the paper and approved the final submitted manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AJCC</term><def><p>American Joint Committee on Cancer</p></def></def-item><def-item><term id="abb3">CoT</term><def><p>chain-of-thought</p></def></def-item><def-item><term id="abb4">GLM</term><def><p>general language model</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb7">M</term><def><p>distant metastasis status</p></def></def-item><def-item><term id="abb8">N</term><def><p>regional lymph node involvement</p></def></def-item><def-item><term id="abb9">NSCLC</term><def><p>non-small cell lung cancer</p></def></def-item><def-item><term id="abb10">OCR</term><def><p>optical character recognition</p></def></def-item><def-item><term id="abb11">SFT</term><def><p>supervised fine-tuning</p></def></def-item><def-item><term id="abb12">SNOMED CT</term><def><p>Systematized Nomenclature of Medicine - Clinical Terms</p></def></def-item><def-item><term id="abb13">T</term><def><p>primary tumor characteristics</p></def></def-item><def-item><term id="abb14">TNM</term><def><p>tumor-node-metastasis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Cancer incidence and mortality in China, 
2022</article-title><source>J Natl Cancer Cent</source><year>2024</year><month>03</month><volume>4</volume><issue>1</issue><fpage>47</fpage><lpage>53</lpage><pub-id pub-id-type="doi">10.1016/j.jncc.2024.01.006</pub-id><pub-id pub-id-type="medline">39036382</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bang</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J ju</given-names> </name><etal/></person-group><article-title>Transcriptome analysis of non-small cell lung cancer and genetically matched adjacent normal tissues identifies novel prognostic marker genes</article-title><source>Genes Genom</source><year>2017</year><month>03</month><volume>39</volume><issue>3</issue><fpage>277</fpage><lpage>284</lpage><pub-id pub-id-type="doi">10.1007/s13258-016-0492-5</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rueschhoff</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Moore</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Jasahui</surname><given-names>MRP</given-names> </name></person-group><article-title>Lung cancer staging&#x2014;a clinical practice review</article-title><source>JoR</source><year>2024</year><volume>4</volume><issue>1</issue><fpage>50</fpage><lpage>61</lpage><pub-id pub-id-type="doi">10.3390/jor4010005</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amin</surname><given-names>MB</given-names> </name><name 
name-style="western"><surname>Greene</surname><given-names>FL</given-names> </name><name name-style="western"><surname>Edge</surname><given-names>SB</given-names> </name><etal/></person-group><article-title>The eighth edition AJCC cancer staging manual: continuing to build a bridge from a population-based to a more &#x201C;personalized&#x201D; approach to cancer staging</article-title><source>CA Cancer J Clin</source><year>2017</year><month>03</month><volume>67</volume><issue>2</issue><fpage>93</fpage><lpage>99</lpage><pub-id pub-id-type="doi">10.3322/caac.21388</pub-id><pub-id pub-id-type="medline">28094848</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Quality control and analysis of treatment for hospitalized cancer patients: an interview and medical records study from nine hospitals in Beijing</article-title><source>Med J Peking Union Med Coll Hosp</source><year>2025</year><volume>16</volume><issue>2</issue><fpage>399</fpage><lpage>405</lpage><pub-id pub-id-type="doi">10.12290/xhyxzz.2024-0265</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Puts</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nobel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zegers</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bermejo</surname><given-names>I</given-names> </name><name name-style="western"><surname>Robben</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dekker</surname><given-names>A</given-names> </name></person-group><article-title>How natural language processing can aid with pulmonary oncology tumor node 
metastasis staging from free-text radiology reports: algorithm development and validation</article-title><source>JMIR Form Res</source><year>2023</year><month>03</month><day>22</day><volume>7</volume><fpage>e38125</fpage><pub-id pub-id-type="doi">10.2196/38125</pub-id><pub-id pub-id-type="medline">36947118</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nobel</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Puts</surname><given-names>S</given-names> </name><name name-style="western"><surname>Krdzalic</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Natural language processing algorithm used for staging pulmonary oncology from free-text radiological reports: &#x201C;Including PET-CT and Validation Towards Clinical Use&#x201D;</article-title><source>J Imaging Inform Med</source><year>2024</year><month>02</month><volume>37</volume><issue>1</issue><fpage>3</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1007/s10278-023-00913-x</pub-id><pub-id pub-id-type="medline">38343237</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Park</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>Automated extraction of information of lung cancer staging from unstructured reports of PET-CT interpretation: natural language processing with deep-learning</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>09</month><day>1</day><volume>22</volume><issue>1</issue><fpage>229</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-01975-7</pub-id><pub-id 
pub-id-type="medline">36050674</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nobel</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Puts</surname><given-names>S</given-names> </name><name name-style="western"><surname>Weiss</surname><given-names>J</given-names> </name><etal/></person-group><article-title>T-staging pulmonary oncology from radiological reports using natural language processing: translating into a multi-language setting</article-title><source>Insights Imaging</source><year>2021</year><month>06</month><day>10</day><volume>12</volume><issue>1</issue><fpage>77</fpage><pub-id pub-id-type="doi">10.1186/s13244-021-01018-1</pub-id><pub-id pub-id-type="medline">34114076</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name></person-group><article-title>A novel deep learning approach to extract Chinese clinical entities for lung cancer screening and staging</article-title><source>BMC Med Inform Decis Mak</source><year>2021</year><month>07</month><day>30</day><volume>21</volume><issue>Suppl 2</issue><fpage>214</fpage><pub-id pub-id-type="doi">10.1186/s12911-021-01575-x</pub-id><pub-id pub-id-type="medline">34330277</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Goel</surname><given-names>MA</given-names> </name></person-group><article-title>LLMs accelerate annotation for medical information extraction</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 4, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.02296</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fink</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Bischoff</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fink</surname><given-names>CA</given-names> </name><etal/></person-group><article-title>Potential of ChatGPT and GPT-4 for data mining of free-text CT reports on lung cancer</article-title><source>Radiology</source><year>2023</year><month>09</month><volume>308</volume><issue>3</issue><fpage>310</fpage><lpage>319</lpage><pub-id pub-id-type="doi">10.1148/radiol.231362</pub-id><pub-id pub-id-type="medline">37724963</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Loeffler</surname><given-names>CM</given-names> </name><name name-style="western"><surname>M&#x00FC;ller-Franzes</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Extracting structured information from unstructured histopathology reports using generative pre-trained transformer 4 (GPT-4)</article-title><source>J Pathol</source><year>2024</year><month>03</month><volume>262</volume><issue>3</issue><fpage>310</fpage><lpage>319</lpage><pub-id pub-id-type="doi">10.1002/path.6232</pub-id><pub-id 
pub-id-type="medline">38098169</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abumelha</surname><given-names>M</given-names> </name><name name-style="western"><surname>Al-Ghamdi</surname><given-names>AAM</given-names> </name><name name-style="western"><surname>Fayoumi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ragab</surname><given-names>M</given-names> </name></person-group><article-title>Medical feature extraction from clinical examination notes: development and evaluation of a two-phase large language model framework</article-title><source>JMIR Med Inform</source><year>2025</year><month>12</month><day>3</day><volume>13</volume><fpage>e78432</fpage><pub-id pub-id-type="doi">10.2196/78432</pub-id><pub-id pub-id-type="medline">41171081</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ba</surname><given-names>Z</given-names> </name></person-group><article-title>Extracting multifaceted characteristics of patients with chronic disease comorbidity: framework development using large language models</article-title><source>JMIR Med Inform</source><year>2025</year><month>05</month><day>15</day><volume>13</volume><fpage>e70096</fpage><pub-id pub-id-type="doi">10.2196/70096</pub-id><pub-id pub-id-type="medline">40373298</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>ChatGPT for automating lung cancer staging: feasibility study on open radiology report dataset</article-title><source>medRxiv</source><comment>Preprint posted online on Dec 3, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.12.11.23299107</pub-id><pub-id pub-id-type="medline">37662351</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Matsuo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Nishio</surname><given-names>M</given-names> </name><name name-style="western"><surname>Matsunaga</surname><given-names>T</given-names> </name><name name-style="western"><surname>Fujimoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Murakami</surname><given-names>T</given-names> </name></person-group><article-title>Exploring multilingual large language models for enhanced TNM classification of radiology report in lung cancer staging</article-title><source>Cancers (Basel)</source><year>2024</year><month>10</month><day>26</day><volume>16</volume><issue>21</issue><fpage>3621</fpage><pub-id pub-id-type="doi">10.3390/cancers16213621</pub-id><pub-id pub-id-type="medline">39518061</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Fujimoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Nishio</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tanaka</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Classification of cancer TNM stage from Japanese radiology report using on-premise LLM at NTCIR-17 mednlp-SC RR-TNM subtask.
</article-title><source>NTCIR Conference on Evaluation of Information Access Technologies</source><year>2023</year><pub-id pub-id-type="doi">10.20736/0002001299</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suzuki</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yamada</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yamazaki</surname><given-names>H</given-names> </name><name name-style="western"><surname>Honda</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sakai</surname><given-names>S</given-names> </name></person-group><article-title>Preliminary assessment of TNM classification performance for pancreatic cancer in Japanese radiology reports using GPT-4</article-title><source>Jpn J Radiol</source><year>2025</year><month>01</month><volume>43</volume><issue>1</issue><fpage>51</fpage><lpage>55</lpage><pub-id pub-id-type="doi">10.1007/s11604-024-01643-y</pub-id><pub-id pub-id-type="medline">39162781</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Lucas</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Lu-Yao</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>CC</given-names> </name></person-group><article-title>Classifying cancer stage with open-source clinical large language models</article-title><conf-name>IEEE International Conference on Healthcare Informatics (ICHI), Orlando, FL, USA</conf-name><conf-date>Jun 3-6, 2024</conf-date><pub-id pub-id-type="doi">10.1109/ICHI61247.2024.00018</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nishio</surname><given-names>M</given-names> </name><name name-style="western"><surname>Matsuo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Matsunaga</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Zero-shot classification of TNM staging for Japanese radiology report using ChatGPT at RR-TNM subtask of NTCIR-17 mednlp-SC</article-title><source>Proceedings of the 17th NTCIR Conference on Evaluation of Information Access Technologies</source><year>2023</year><pub-id pub-id-type="doi">10.20736/0002001283</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="book"><article-title>General-purpose character recognition (high-precision version)</article-title><source>Tencent Cloud</source><access-date>2026-04-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cloud.tencent.com/document/product/866/34937">https://cloud.tencent.com/document/product/866/34937</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldstraw</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chansky</surname><given-names>K</given-names> </name><name name-style="western"><surname>Crowley</surname><given-names>J</given-names> </name><etal/></person-group><article-title>The IASLC lung cancer staging project: proposals for revision of the TNM stage groupings in the forthcoming (Eighth) edition of the TNM classification for lung cancer</article-title><source>J Thorac Oncol</source><year>2016</year><month>01</month><volume>11</volume><issue>1</issue><fpage>39</fpage><lpage>51</lpage><pub-id pub-id-type="doi">10.1016/j.jtho.2015.09.009</pub-id><pub-id 
pub-id-type="medline">26762738</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Attribution 4.0 international (CC BY 4.0)</article-title><source>Creative Commons</source><access-date>2026-03-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Team GLM: Zeng A, Xu B, Wang B, et al</collab></person-group><article-title>ChatGLM: a family of large language models from GLM-130B to GLM-4 all tools</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 18, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.12793</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>E</given-names> </name></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 10, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lakens</surname><given-names>D</given-names> </name></person-group><article-title>Calculating and reporting effect sizes to facilitate cumulative science: a practical primer for t-tests and ANOVAs</article-title><source>Front Psychol</source><year>2013</year><month>11</month><day>26</day><volume>4</volume><fpage>863</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2013.00863</pub-id><pub-id pub-id-type="medline">24324449</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation 
citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><source>Statistical Power Analysis for the Behavioral Sciences</source><year>1988</year><pub-id pub-id-type="other">9780203771587</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Distribution of data set by imaging modality, language and hospital sources.</p><media xlink:href="ai_v5i1e77988_app1.xlsx" xlink:title="XLSX File, 15 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Distribution of T, N, M staging of the Data Set.</p><media xlink:href="ai_v5i1e77988_app2.xlsx" xlink:title="XLSX File, 14 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Domain of the Auxiliary Instruction Samples.</p><media xlink:href="ai_v5i1e77988_app3.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Result of post-OCR Chinese character error rate test and Examples of common OCR error types.</p><media xlink:href="ai_v5i1e77988_app4.pdf" xlink:title="PDF File, 235 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Deidentified example dataset.</p><media xlink:href="ai_v5i1e77988_app5.pdf" xlink:title="PDF File, 51 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Sample of T Staging.</p><media xlink:href="ai_v5i1e77988_app6.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Sample of N Staging.</p><media xlink:href="ai_v5i1e77988_app7.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Sample of 
M Staging.</p><media xlink:href="ai_v5i1e77988_app8.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Model Card.</p><media xlink:href="ai_v5i1e77988_app9.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>