<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Cardio</journal-id><journal-id journal-id-type="publisher-id">cardio</journal-id><journal-id journal-id-type="index">26</journal-id><journal-title>JMIR Cardio</journal-title><abbrev-journal-title>JMIR Cardio</abbrev-journal-title><issn pub-type="epub">2561-1011</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v10i1e89480</article-id><article-id pub-id-type="doi">10.2196/89480</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Extracting Cardiorespiratory Symptoms From Clinical Notes Using Open-Weight Large Language Models: Method Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Bai</surname><given-names>Yunbing</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cui</surname><given-names>Wanting</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Finkelstein</surname><given-names>Joseph</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Arizona Center for Telemedicine and Digital Health, College of Medicine, University of Arizona</institution><addr-line>1501 N Campbell 
Ave AHSL 1156</addr-line><addr-line>Tucson</addr-line><addr-line>AZ</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Garc&#x00ED;a-Barrag&#x00E1;n</surname><given-names>&#x00C1;lvaro</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Torii</surname><given-names>Manabu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yunbing Bai, MS, Arizona Center for Telemedicine and Digital Health, College of Medicine, University of Arizona, 1501 N Campbell Ave AHSL 1156, Tucson, AZ, 85724-5105, United States, 1 520-626-3944; <email>yunbing@arizona.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>22</day><month>5</month><year>2026</year></pub-date><volume>10</volume><elocation-id>e89480</elocation-id><history><date date-type="received"><day>11</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>28</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>28</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Yunbing Bai, Wanting Cui, Joseph Finkelstein. Originally published in JMIR Cardio (<ext-link ext-link-type="uri" xlink:href="https://cardio.jmir.org">https://cardio.jmir.org</ext-link>), 22.5.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Cardio, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://cardio.jmir.org">https://cardio.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://cardio.jmir.org/2026/1/e89480"/><abstract><sec><title>Background</title><p>Accurate identification of clinical symptoms and signs (S&#x0026;S) is essential for the early detection of high-burden cardiorespiratory conditions, including lung cancer, chronic obstructive pulmonary disease, and heart failure. Although symptom data play a central role in diagnostic reasoning and predictive modeling, most S&#x0026;S information remains embedded in unstructured electronic health record notes, limiting their use in automated phenotyping, surveillance, and clinical decision support. Traditional natural language processing systems struggle with domain variability and contextual nuance in clinical text. Recent advances in large language models (LLMs) offer a promising alternative, yet challenges remain in hallucinations, overinference, and safe deployment. 
This study evaluated whether locally deployed open-weight models could reliably extract cardiorespiratory S&#x0026;S and map them to <italic>ICD-10-CM</italic> (<italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>) codes using optimized prompting strategies.</p></sec><sec><title>Objective</title><p>This study aims to assess the accuracy of open-weight LLMs in extracting explicitly stated cardiorespiratory S&#x0026;S from clinical notes and mapping them to <italic>ICD-10-CM</italic> codes (R00-R09) and to compare performance across 4 prompt-engineering strategies, including a multimodule LLM framework.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 593 clinical notes from the MTSamples database were manually reviewed, with 93 notes used for prompt development and comparison using Llama 3.3-70B, and 500 notes used as testing data for the final best prompt setting using both Llama 3.3-70B and gpt-oss-120B. Four prompting conditions were evaluated: (1) instruction-only prompts, (2) <italic>ICD-10-CM</italic> definition&#x2013;based prompts, (3) assumption-free prompts, and (4) a multimodule LLM framework with postprocessing. Performance was measured using precision, recall, and <italic>F</italic><sub>1</sub>-score for both S&#x0026;S extraction and <italic>ICD-10-CM</italic> code generation.</p></sec><sec sec-type="results"><title>Results</title><p>Across all prompt strategies, model performance improved as more structure and constraints were added. Instruction-only prompting demonstrated high recall but poor precision. Incorporating <italic>ICD-10-CM</italic> definitions improved coding accuracy, and assumption-free prompting further balanced precision and recall. The multimodule approach with postprocessing achieved the highest performance during prompt development. On the independent test corpus, entity-level microaveraged evaluation showed that gpt-oss-120B outperformed Llama 3.3-70B in both tasks. 
For S&#x0026;S extraction, Llama 3.3-70B achieved a precision of 0.63, a recall of 0.86, and an <italic>F</italic><sub>1</sub>-score of 0.73, whereas gpt-oss-120B achieved a precision of 0.89, a recall of 0.87, and an <italic>F</italic><sub>1</sub>-score of 0.88. For <italic>ICD-10-CM</italic> code mapping, Llama 3.3-70B achieved a precision of 0.59, a recall of 0.83, and an <italic>F</italic><sub>1</sub>-score of 0.69, whereas gpt-oss-120B achieved a precision of 0.90, a recall of 0.84, and an <italic>F</italic><sub>1</sub>-score of 0.87.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Locally deployed LLMs, when paired with optimized prompting and multimodule orchestration, can accurately extract cardiorespiratory S&#x0026;S and generate <italic>ICD-10-CM</italic> codes from unstructured clinical notes. This approach increases the level of data safety by enabling on-premises processing without external data transmission and demonstrates strong potential for scalable, domain-adaptive symptom extraction pipelines in biomedical informatics. Future work should expand datasets and evaluate generalizability across clinical domains.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>large language models</kwd><kwd>electronic health records</kwd><kwd>signs and symptoms</kwd><kwd>clinical coding</kwd><kwd>prompt engineering</kwd><kwd>named entity recognition</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Understanding the clinical symptoms and signs (S&#x0026;S) of high-burden cardiorespiratory conditions&#x2014;such as lung cancer, chronic obstructive pulmonary disease (COPD), and heart failure&#x2014;is essential for timely diagnosis, risk prediction, and improved patient outcomes. 
For example, persistent cough, dyspnea, chest discomfort, or hemoptysis can expedite lung cancer detection, allowing treatment at earlier stages when prognosis is more favorable [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. In heart failure, monitoring subtle changes in breathing patterns, exercise tolerance, or peripheral edema supports the early prediction of disease onset and the prevention of acute decompensations [<xref ref-type="bibr" rid="ref4">4</xref>]. Similarly, the early identification of worsening respiratory symptoms&#x2014;such as increased breathlessness or sputum production&#x2014;can help predict and prevent COPD exacerbations, a leading cause of hospitalization and mortality [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>In clinical practice, symptom data play a pivotal role in both diagnostic decision-making and predictive modeling. Hospital readmission risk models often incorporate structured and unstructured symptom information to identify patients at higher risk for rehospitalization [<xref ref-type="bibr" rid="ref6">6</xref>]. Predictive algorithms for heart failure onset and COPD exacerbation frequently rely on longitudinal patterns in clinical data&#x2014;including symptom trajectories&#x2014;to generate early alerts for clinicians [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Likewise, lung cancer diagnostic pathways integrate symptom reports with imaging and laboratory findings to guide further evaluation [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Despite their importance, most S&#x0026;S are documented in unstructured free-text clinical notes rather than structured electronic health record fields. 
These unstructured data contain nuanced clinical observations valuable for predicting lung cancer, heart failure onset, COPD exacerbation, and hospital readmission risk. However, the complexity, variability, and domain-specific language of clinical text pose significant challenges for traditional natural language processing (NLP) methods [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Advances in large language models (LLMs) have shown promise in overcoming these challenges, enabling more accurate and timely extraction of clinically relevant information across diverse documentation styles and supporting predictive analytics in high-impact conditions [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>To address these challenges, specialized biomedical NLP models such as BioBERT and BioRAG have been developed to increase accuracy in symptom identification [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. These models are particularly effective in detecting symptoms associated with chronic conditions and cancers, making them valuable tools for early disease prediction. However, their dependency on task-specific training datasets can restrict adaptability to new medical contexts. Advanced generative models, such as GPT-4o, have shown promise in identifying symptoms across clinical domains and require little task-specific training [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. However, most of these models are general purpose and are not fully suited to medical tasks. 
In addition, while these models are convenient to use, they often run on commercial cloud platforms, which can raise serious privacy concerns when handling sensitive patient data.</p><p>In the <italic>ICD-10-CM</italic> (<italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>) coding system, there is a dedicated section for S&#x0026;S, ranging from R00 to R99 (symptoms, signs, and abnormal clinical and laboratory findings). Our previous studies showed that some LLMs, capable of being deployed and operated locally, demonstrated strong performance in extracting S&#x0026;S and mapping them to <italic>ICD-10-CM</italic> codes within the genitourinary system, specifically in the range of R30 to R39 (symptoms and signs involving the genitourinary system) [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. However, as task complexity increased, model performance became unstable. In this study, we aim to explore the feasibility of using LLMs with different prompt-engineering strategies to accurately extract cardiorespiratory-related S&#x0026;S from medical text, map the extracted S&#x0026;S to standardized <italic>ICD-10-CM</italic> codes in the range of R00 to R09 (symptoms and signs involving the circulatory and respiratory systems), and generate structured outputs to support automated evaluation.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Source</title><p>This study conducted a comprehensive analysis of clinical notes sourced from the MTSamples database [<xref ref-type="bibr" rid="ref16">16</xref>], with a specific focus on records related to medical conditions. The initial phase involved identifying notes that contained keywords associated with cardiorespiratory disorders. These notes underwent manual review, during which any entries that did not explicitly describe clinical S&#x0026;S were excluded. 
After this process, a final dataset comprising 93 notes was selected. These notes were retained in their original form, without any textual modifications. For evaluation purposes, annotations provided by clinical experts served as the gold standard. We extracted and annotated only those S&#x0026;S related to the cardiorespiratory system that could be mapped to <italic>ICD-10-CM</italic> codes R00 to R09 [<xref ref-type="bibr" rid="ref17">17</xref>]. These 93 cardiorespiratory clinical notes served as the training data for prompt development.</p><p>An additional 500 expert-labeled clinical notes from the MTSamples database were used as the test dataset. Unlike the training data, the test notes were not limited to the cardiorespiratory domain; instead, they included S&#x0026;S spanning multiple clinical systems. This design more closely reflects real-world clinical documentation and enables a more realistic evaluation of model performance.</p><p>Each clinical note was annotated by 2 individuals: a nurse and a clinical informatician with experience in medical coding. The Cohen &#x03BA; coefficient was used to evaluate agreement between the 2 annotators, as previously described. The interrater reliability based on the Cohen &#x03BA; coefficient was 0.75 (SD 0.06) in this study.</p></sec><sec id="s2-2"><title>Prompt Engineering and LLM</title><p>In this study, we selected 2 open-weight LLMs for evaluation: Llama 3.3-70B and gpt-oss-120B. Prompt development was conducted using Llama 3.3-70B for iterative prompt engineering and method design [<xref ref-type="bibr" rid="ref18">18</xref>]. 
We chose Llama 3.3-70B based on our previous studies, in which it demonstrated strong performance in extracting clinical S&#x0026;S from medical notes [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>After the final prompt framework was established, we evaluated the proposed approach using both Llama 3.3-70B and gpt-oss-120B to assess whether the workflow generalized across different open-weight models [<xref ref-type="bibr" rid="ref19">19</xref>]. For both models, the temperature was set to 0, and all other hyperparameters were kept at their default settings during evaluation.</p><p>We conducted all experiments on a local workstation equipped with a 32-core 2.50 GHz Intel Xeon W7-3565X processor, 256 GB of RAM, and 2 NVIDIA RTX PRO 6000 Blackwell Max-Q graphics cards with a total of 192 GB of graphics processing unit memory. Both models were deployed locally using Ollama, a lightweight and extensible framework for local LLM deployment. Llama 3.3-70B was run using Q4_K_M quantization, whereas gpt-oss-120B was run using MXFP4 quantization [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. No central processing unit offloading was used during inference. The approximate inference speed was 27.88 tokens/s for Llama 3.3-70B and 152.73 tokens/s for gpt-oss-120B. Although gpt-oss-120B has a larger total parameter count than Llama 3.3-70B, its faster throughput on the same hardware is not necessarily unexpected because gpt-oss-120B is a sparse mixture-of-experts model, meaning that only a subset of parameters is active for each token during inference. By contrast, Llama 3.3-70B is a dense model. 
The difference may also reflect the deployment formats used in this study, as gpt-oss-120B was run in MXFP4 and Llama 3.3-70B in Q4_K_M.</p><p>The prompt was designed to accomplish 2 primary tasks: extracting the exact S&#x0026;S from clinical notes and accurately mapping the extracted S&#x0026;S to the correct <italic>ICD-10-CM</italic> code groups (R00-R09). To achieve these objectives, multiple prompt structures were tested before selecting an instruction-based prompting approach. The prompt template we applied organized the prompt into 3 distinct sections: &#x201C;TASK,&#x201D; &#x201C;REQUIREMENT,&#x201D; and &#x201C;CLINICAL NOTE.&#x201D; The TASK section clearly defined the objective, instructing the model to extract S&#x0026;S explicitly mentioned in the clinical notes and associate them with the correct <italic>ICD-10-CM</italic> codes. The REQUIREMENT section provided specific constraints to guide the extraction process. The CLINICAL NOTE section contained the original medical transcription text, from which the model was expected to extract relevant information.</p><p>We iterated through 4 generations of prompts, aiming to improve the model&#x2019;s extraction of clinical S&#x0026;S, as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. In our initial approach, we provided the task background and goals within the prompt to evaluate the feasibility and general performance of the model without further instructions on extracting S&#x0026;S or converting them to <italic>ICD-10-CM</italic> codes.</p><p>Then, in the second generation, we provided the model with a list of <italic>ICD-10-CM</italic> codes and their definitions within the prompt. To balance the need for completeness while controlling the input length, we limited the <italic>ICD-10-CM</italic> codes to those within the range R00 to R09, specifically including 3-character and 4-character codes (R00-R09 and R00.0-R09.9). 
These codes corresponded to &#x201C;symptoms and signs involving the circulatory and respiratory systems&#x201D; (<xref ref-type="table" rid="table1">Table 1</xref>). In addition, clinical notes often contained negative S&#x0026;S, such as &#x201C;No cough, shortness of breath, fever, or chills.&#x201D; To ensure that only positively reported S&#x0026;S were extracted, we explicitly instructed the model to exclude any negative S&#x0026;S from its output. In the third iteration, we aimed to reduce hallucinations by explicitly instructing the model to extract only S&#x0026;S explicitly mentioned in the clinical notes and to avoid making assumptions about content not explicitly stated.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the 4 prompt-engineering iterations evaluated in this retrospective methodological study for extracting cardiorespiratory symptoms and signs from unstructured clinical notes. The study used deidentified medical transcription notes from the publicly available MTSamples database. Iterations progressed from instruction-only prompting to <italic>ICD-10-CM</italic> (<italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>) definition&#x2013;based prompts, assumption-free constraints, and finally a multimodule large language model framework with role separation (extraction module and refinement module). Each iteration introduced additional constraints to reduce hallucinations, limit inference, and improve structured <italic>ICD-10-CM</italic> code generation (R00-R09). 
Bolded elements indicate newly added prompt components at each iteration.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="cardio_v10i1e89480_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p><italic>ICD-10-CM</italic><sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> symptom and sign codes (R00-R09) and corresponding definitions related to the circulatory and respiratory systems that were explicitly provided to the large language models during prompt engineering<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"><italic>ICD-10-CM</italic> code</td><td align="left" valign="bottom">Definition</td></tr></thead><tbody><tr><td align="left" valign="top">R00</td><td align="left" valign="top">Abnormalities of heart beat</td></tr><tr><td align="left" valign="top">R00.0</td><td align="left" valign="top">Tachycardia, unspecified</td></tr><tr><td align="left" valign="top">R00.1</td><td align="left" valign="top">Bradycardia, unspecified</td></tr><tr><td align="left" valign="top">R00.2</td><td align="left" valign="top">Palpitations</td></tr><tr><td align="left" valign="top">R00.8</td><td align="left" valign="top">Other abnormalities of heart beat</td></tr><tr><td align="left" valign="top">R00.9</td><td align="left" valign="top">Unspecified abnormalities of heart beat</td></tr><tr><td align="left" valign="top">R01</td><td align="left" valign="top">Cardiac murmurs and other cardiac sounds</td></tr><tr><td align="left" valign="top">R01.0</td><td align="left" valign="top">Benign and innocent cardiac murmurs</td></tr><tr><td align="left" valign="top">R01.1</td><td align="left" valign="top">Cardiac murmur, unspecified</td></tr><tr><td align="left" valign="top">R01.2</td><td align="left" valign="top">Other cardiac sounds</td></tr><tr><td align="left" 
valign="top">R03</td><td align="left" valign="top">Abnormal blood pressure reading, without diagnosis</td></tr><tr><td align="left" valign="top">R03.0</td><td align="left" valign="top">Elevated blood pressure reading, without diagnosis of hypertension</td></tr><tr><td align="left" valign="top">R03.1</td><td align="left" valign="top">Nonspecific low blood pressure reading</td></tr><tr><td align="left" valign="top">R04</td><td align="left" valign="top">Hemorrhage from respiratory passages</td></tr><tr><td align="left" valign="top">R04.0</td><td align="left" valign="top">Epistaxis</td></tr><tr><td align="left" valign="top">R04.1</td><td align="left" valign="top">Hemorrhage from throat</td></tr><tr><td align="left" valign="top">R04.2</td><td align="left" valign="top">Hemoptysis</td></tr><tr><td align="left" valign="top">R04.8</td><td align="left" valign="top">Hemorrhage from other sites in respiratory passages</td></tr><tr><td align="left" valign="top">R04.9</td><td align="left" valign="top">Hemorrhage from respiratory passages, unspecified</td></tr><tr><td align="left" valign="top">R05</td><td align="left" valign="top">Cough</td></tr><tr><td align="left" valign="top">R05.1</td><td align="left" valign="top">Acute cough</td></tr><tr><td align="left" valign="top">R05.2</td><td align="left" valign="top">Subacute cough</td></tr><tr><td align="left" valign="top">R05.3</td><td align="left" valign="top">Chronic cough</td></tr><tr><td align="left" valign="top">R05.4</td><td align="left" valign="top">Cough syncope</td></tr><tr><td align="left" valign="top">R05.8</td><td align="left" valign="top">Other specified cough</td></tr><tr><td align="left" valign="top">R05.9</td><td align="left" valign="top">Cough, unspecified</td></tr><tr><td align="left" valign="top">R06</td><td align="left" valign="top">Abnormalities of breathing</td></tr><tr><td align="left" valign="top">R06.0</td><td align="left" valign="top">Dyspnea</td></tr><tr><td align="left" valign="top">R06.1</td><td 
align="left" valign="top">Stridor</td></tr><tr><td align="left" valign="top">R06.2</td><td align="left" valign="top">Wheezing</td></tr><tr><td align="left" valign="top">R06.3</td><td align="left" valign="top">Periodic breathing</td></tr><tr><td align="left" valign="top">R06.4</td><td align="left" valign="top">Hyperventilation</td></tr><tr><td align="left" valign="top">R06.5</td><td align="left" valign="top">Mouth breathing</td></tr><tr><td align="left" valign="top">R06.6</td><td align="left" valign="top">Hiccough</td></tr><tr><td align="left" valign="top">R06.7</td><td align="left" valign="top">Sneezing</td></tr><tr><td align="left" valign="top">R06.8</td><td align="left" valign="top">Other abnormalities of breathing</td></tr><tr><td align="left" valign="top">R06.9</td><td align="left" valign="top">Unspecified abnormalities of breathing</td></tr><tr><td align="left" valign="top">R07</td><td align="left" valign="top">Pain in throat and chest</td></tr><tr><td align="left" valign="top">R07.0</td><td align="left" valign="top">Pain in throat</td></tr><tr><td align="left" valign="top">R07.1</td><td align="left" valign="top">Chest pain on breathing</td></tr><tr><td align="left" valign="top">R07.2</td><td align="left" valign="top">Precordial pain</td></tr><tr><td align="left" valign="top">R07.8</td><td align="left" valign="top">Other chest pain</td></tr><tr><td align="left" valign="top">R07.9</td><td align="left" valign="top">Chest pain, unspecified</td></tr><tr><td align="left" valign="top">R09</td><td align="left" valign="top">Other symptoms and signs involving the circulatory and respiratory system</td></tr><tr><td align="left" valign="top">R09.0</td><td align="left" valign="top">Asphyxia and hypoxemia</td></tr><tr><td align="left" valign="top">R09.1</td><td align="left" valign="top">Pleurisy</td></tr><tr><td align="left" valign="top">R09.2</td><td align="left" valign="top">Respiratory arrest</td></tr><tr><td align="left" valign="top">R09.3</td><td align="left" 
valign="top">Abnormal sputum</td></tr><tr><td align="left" valign="top">R09.8</td><td align="left" valign="top">Other specified symptoms and signs involving the circulatory and respiratory systems</td></tr><tr><td align="left" valign="top">R09.A</td><td align="left" valign="top">Foreign body sensation of the circulatory and respiratory system</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup><italic>ICD-10-CM</italic>: <italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>.</p></fn><fn id="table1fn2"><p><sup>b</sup>These codes represent standardized clinical symptom categories used for mapping extracted symptoms and signs in a retrospective analysis of deidentified clinical notes from the MTSamples database. Only 3-character and 4-character <italic>ICD-10-CM</italic> codes within the R00 to R09 range were included to limit the input length of models.</p></fn></table-wrap-foot></table-wrap><p>In the fourth iteration, we aimed to prompt the model to produce structured responses that would enable automated evaluation. However, to avoid increasing task complexity and causing unstable outputs, we adopted a multimodule LLM approach in which specialized modules (eg, extraction and refinement) collaborate through role-based interactions to decompose the task and generate standardized outputs&#x2014;an approach that has been shown to improve reliability and orchestration in complex workflows [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Specifically, we used 2 modules: an extraction module (EM) and a refinement module (RM). The EM was responsible for identifying all valid S&#x0026;S explicitly mentioned in the clinical notes. This task consisted of two subtasks: (1) extracting candidate S&#x0026;S and (2) filtering out false positives. 
The definitions of S&#x0026;S from the National Cancer Institute were provided in the prompt, and the model was instructed to include only explicitly stated S&#x0026;S and to exclude negated or denied findings. During preliminary testing, the model occasionally interpreted examination results&#x2014;such as blood pressure values&#x2014;as clinical signs. To mitigate this issue, we further instructed the model to exclude examination findings that contained numerical values.</p><p>The RM was responsible for mapping the extracted S&#x0026;S to <italic>ICD-10-CM</italic> codes in the range R00 to R09. For this module, we provided a predefined list of <italic>ICD-10-CM</italic> codes within this range, along with the required output format. The RM ingested the unstructured plain text output generated by the EM, selected S&#x0026;S that matched the provided code list, and returned the results in the specified format.</p><p>Following model inference, we performed postprocessing and data cleaning on the RM&#x2019;s structured outputs to eliminate assumption-based results. Based on a review of outputs from earlier iterations, we developed a red-flag keyword list (<xref ref-type="other" rid="box1">Textbox 1</xref>). Any output entry containing a term from this list was flagged as high risk for false positives due to assumptions, inclusion of negated S&#x0026;S, or hallucinations. All such entries were subsequently removed from the final results.</p><boxed-text id="box1"><title>Red-flag keyword list used during postprocessing to identify and remove high-risk false-positive outputs generated by the multimodule large language model (LLM) framework. Keywords reflect assumption-based language, negation, inference, or nonexplicit symptom reporting. 
Any extracted symptom or <italic>ICD-10-CM</italic> (<italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>) code description containing one or more of these terms was excluded from the final results.</title><p><bold>Red-flag words</bold></p><list list-type="bullet"><list-item><p>Possible</p></list-item><list-item><p>No</p></list-item><list-item><p>Related</p></list-item><list-item><p>Mentioned</p></list-item><list-item><p>Not present</p></list-item><list-item><p>Appears</p></list-item><list-item><p>Implied</p></list-item><list-item><p>Normal</p></list-item><list-item><p>Considered</p></list-item><list-item><p>Involving</p></list-item><list-item><p>Explicitly</p></list-item></list></boxed-text></sec><sec id="s2-3"><title>Evaluation</title><p>The model&#x2019;s performance was assessed through a 2-part evaluation for each clinical note. First, the accuracy of extracted S&#x0026;S was measured to determine how well the model identified only the explicitly stated S&#x0026;S from the text. Second, the accuracy of generated <italic>ICD-10-CM</italic> codes was evaluated to assess the model&#x2019;s ability to correctly map extracted S&#x0026;S to the appropriate <italic>ICD-10-CM</italic> categories.</p><p>To quantify performance, precision, recall, and <italic>F</italic><sub>1</sub>-score were used, with human expert annotations serving as the gold standard for comparison. Precision measured the proportion of predicted S&#x0026;S that were correct, and recall measured the proportion of ground-truth S&#x0026;S that were recognized. Model outputs were evaluated at the individual clinical note level, and average performance metrics were then calculated across all notes. For <italic>ICD-10-CM</italic> code evaluation, only the first 3 characters of the code were considered to account for variations in granularity. 
For example, R01, R01.1, and R01.21 were all treated as R01, ensuring consistency in assessment.</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">P</mml:mi><mml:mi 
mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mo>=</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">T</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">u</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">F</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi 
mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">N</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">g</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">t</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">v</mml:mi><mml:mi mathvariant="normal">e</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><disp-formula id="equWL3"><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow><mml:mrow><mml:mi mathvariant="normal">P</mml:mi><mml:mi mathvariant="normal">r</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi mathvariant="normal">s</mml:mi><mml:mi mathvariant="normal">i</mml:mi><mml:mi 
mathvariant="normal">o</mml:mi><mml:mi mathvariant="normal">n</mml:mi><mml:mo>+</mml:mo><mml:mi mathvariant="normal">R</mml:mi><mml:mi mathvariant="normal">e</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">l</mml:mi><mml:mi mathvariant="normal">l</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Prompt development was conducted using Llama 3.3-70B. Accordingly, performance on the prompt development dataset was compared across prompt iterations to assess how changes in prompt design influenced model outputs. To reduce the risk of overfitting the final framework to either the development data or the model used for prompt engineering, both Llama 3.3-70B and gpt-oss-120B were evaluated on the independent test dataset.</p><p>Because the test dataset included notes from multiple clinical systems rather than only cardiorespiratory conditions, many notes contained no annotated cardiorespiratory S&#x0026;S. In an initial note-level evaluation, notes with both empty ground-truth labels and empty model outputs were treated as correct matches. However, because this macroaveraging approach may inflate overall performance in a sparse dataset, we additionally calculated precision, recall, and <italic>F</italic><sub>1</sub>-score using standard entity-level microaveraging across the corpus.</p><p>For microaveraged evaluation, predicted and reference labels were first reduced to unique sets within each clinical note. True positives were defined as labels present in both the model output and expert annotation for the same note, false positives as labels predicted by the model but absent from the reference set, and false negatives as reference labels missed by the model. These counts were then summed across all notes, and precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated from the aggregated totals. 
For <italic>ICD-10-CM</italic> code evaluation, only the first 3 characters of each code were used in this corpus-level analysis. Notes with no gold labels and no predicted labels did not contribute to true-positive counts and were not assigned perfect scores.</p><p>In addition to reporting overall performance, we evaluated the model separately for each primary <italic>ICD-10-CM</italic> category within the R00 to R09 range. For this subgroup analysis, both predicted and reference codes were grouped at the 3-character category level (eg, R00, R01, R03, R04, R05, R06, R07, and R09), and precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated independently for each category. Within each category, true positives represented correctly predicted codes that matched the expert annotation for the same clinical note, false positives represented incorrectly predicted codes assigned to that category, and false negatives represented annotated codes in that category that were not identified by the model. This analysis was included to characterize variation in performance across individual cardiorespiratory <italic>ICD-10-CM</italic> subcategories.</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study did not involve human participants or the collection of identifiable private information. All analyses were conducted using secondary data from the publicly available MTSamples database, which consists of deidentified clinical transcription samples created for educational and research purposes [<xref ref-type="bibr" rid="ref16">16</xref>]. No patient identifiers were accessed, recorded, or analyzed, and no attempt was made to reidentify individuals. In accordance with journal guidelines, this research does not meet the definition of human subjects research and was therefore exempt from institutional review board review and informed consent requirements. 
The study was conducted in compliance with applicable ethical standards for research using deidentified secondary data.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Among the 93 human-annotated clinical notes, a total of 168 S&#x0026;S related to the cardiorespiratory system were identified. On average, each note contained 1.81 (SD 1.09) labeled symptoms. The most frequently observed <italic>ICD-10-CM</italic> category was R06&#x2014;Abnormalities of breathing, present in 52 notes, followed by R01&#x2014;Cardiac murmurs and other cardiac sounds (31 notes), R00&#x2014;Abnormalities of heart beat (26 notes), and R07&#x2014;Pain in throat and chest (25 notes) [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>Performance varied across the 4 prompt settings evaluated (<xref ref-type="table" rid="table2">Table 2</xref>). The instruction-only prompt resulted in high recall but low precision in both tasks, yielding an <italic>F</italic><sub>1</sub>-score of 0.54 for S&#x0026;S extraction and 0.41 for <italic>ICD-10-CM</italic> code generation. Introducing <italic>ICD-10-CM</italic> definitions improved both precision and <italic>F</italic><sub>1</sub>-scores, especially for code generation (<italic>F</italic><sub>1</sub>-score=0.70), though a slight reduction in recall was observed in extraction. 
Further improvement was seen when assumption-free constraints were added, increasing <italic>F</italic><sub>1</sub>-scores to 0.69 for extraction and 0.74 for code generation, with more balanced precision-recall trade-offs.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance comparison of 4 prompt-engineering strategies for extracting cardiorespiratory symptoms and signs and generating corresponding <italic>ICD-10-CM</italic><sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> codes (R00-R09).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Prompt setting</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">S&#x0026;S<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup> extraction</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Instruction only</td><td align="left" valign="top">0.38</td><td align="left" valign="top">0.92</td><td align="left" valign="top">0.54</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>ICD-10-CM</italic> code&#x2013;based symptom extraction</td><td align="left" valign="top">0.51</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.64</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Assumption-free constraints</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.69</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Multimodule LLMs<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.86</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Multimodule LLMs with postdata cleaning</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.90</td></tr><tr><td align="left" valign="top" colspan="4"><italic>ICD-10-CM</italic> code generation</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Instruction only</td><td align="left" valign="top">0.29</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.41</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>ICD-10-CM</italic> code&#x2013;based symptom extraction</td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.70</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Assumption-free constraints</td><td align="left" valign="top">0.69</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.74</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Multimodule LLMs</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.87</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Multimodule LLMs with postdata cleaning</td><td align="left" valign="top">0.83</td><td align="left" 
valign="top">0.95</td><td align="left" valign="top">0.89</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup><italic>ICD-10-CM</italic>: <italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>.</p></fn><fn id="table2fn2"><p><sup>b</sup>Precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated at the clinical note level and averaged across notes. The results are shown separately for S&#x0026;S extraction and <italic>ICD-10-CM</italic> code generation, demonstrating progressive performance improvements as additional constraints and a multimodule architecture with postprocessing were introduced.</p></fn><fn id="table2fn3"><p><sup>c</sup>S&#x0026;S: symptoms and signs.</p></fn><fn id="table2fn4"><p><sup>d</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap><p>The highest performance was achieved using a multimodule LLM framework with postdata cleaning. This setting produced an <italic>F</italic><sub>1</sub>-score of 0.90 for S&#x0026;S extraction and 0.89 for <italic>ICD-10-CM</italic> code generation, reflecting strong gains in both precision and recall. These results demonstrate the progressive effectiveness of structured prompting and refinement strategies in improving model outputs across clinical NLP tasks.</p><p>Overall, the multimodule architecture demonstrated superior performance. Without postprocessing, the model achieved an <italic>F</italic><sub>1</sub>-score of 0.86 for S&#x0026;S extraction and 0.87 for <italic>ICD-10-CM</italic> code generation, indicating substantial improvements in both precision and recall. However, the model occasionally produced unstable outputs containing assumption-based errors. 
Incorporating postprocessing and data-cleaning steps eliminated these assumptions from the multimodule outputs and further improved performance, yielding <italic>F</italic><sub>1</sub>-scores of 0.90 for S&#x0026;S extraction and 0.89 for <italic>ICD-10-CM</italic> code generation.</p><p>In the test dataset, a total of 1601 S&#x0026;S were labeled. On average, each clinical note contained 3.20 (SD 2.24) labeled S&#x0026;S. Among these notes, 69 contained at least one cardiorespiratory symptom or sign, with an average of 1.33 (SD 0.70) cardiorespiratory S&#x0026;S per note. The multimodule LLM framework with postprocessing was evaluated on this dataset using both Llama 3.3-70B and gpt-oss-120B (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Evaluation results of the proposed multimodule large language model framework with postprocessing data cleaning for extracting cardiorespiratory symptoms and signs (S&#x0026;S) and generating corresponding <italic>ICD-10-CM</italic><sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> codes (R00-R09)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Data and model</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">All testing clinical notes (n=500)</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>S&#x0026;S<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> extraction</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.3-70B</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>gpt-oss-120B</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.97</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>ICD-10-CM</italic> code generation</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.3-70B</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>gpt-oss-120B</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.98</td></tr><tr><td align="left" valign="top" colspan="5">Testing notes with cardiorespiratory S&#x0026;S (n=69)</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>S&#x0026;S extraction</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.3-70B</td><td align="left" 
valign="top">0.89</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.86</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>gpt-oss-120B</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.91</td></tr><tr><td align="left" valign="top" colspan="5"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>ICD-10-CM</italic> code generation</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.3-70B</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.87</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>gpt-oss-120B</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.87</td><td align="left" valign="top">0.92</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup><italic>ICD-10-CM</italic>: <italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>.</p></fn><fn id="table3fn2"><p><sup>b</sup>This retrospective evaluation was conducted using an independent test dataset of 500 deidentified clinical notes from the publicly available MTSamples database. The test dataset included notes spanning multiple clinical systems, of which a subset contained cardiorespiratory symptoms. 
Performance is reported separately for (1) all test notes (n=500) and (2) notes containing at least one cardiorespiratory S&#x0026;S (n=69). Two locally deployable, open-weight large language models&#x2014;Llama 3.3-70B and gpt-oss-120B&#x2014;were evaluated using identical multimodule workflows and postprocessing rules. Precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated at the clinical note level and macroaveraged across notes. This table demonstrates the robustness and cross-model generalizability of the proposed strategy for cardiorespiratory symptom extraction and <italic>ICD-10-CM</italic> mapping.</p></fn><fn id="table3fn3"><p><sup>c</sup>S&#x0026;S: symptoms and signs.</p></fn></table-wrap-foot></table-wrap><p>For Llama 3.3-70B, performance on notes containing cardiorespiratory S&#x0026;S was comparable to that observed in the prompt development dataset. gpt-oss-120B, which is a more recent model with a larger number of parameters than Llama 3.3-70B, achieved overall better performance. This result demonstrates the strong generalizability of the proposed multimodule LLM framework across different model architectures.</p><p>Under standard entity-level microaveraging across the full test corpus (<xref ref-type="table" rid="table4">Table 4</xref>), performance was lower than in the note-level evaluation. gpt-oss-120B outperformed Llama 3.3-70B in both tasks, achieving an <italic>F</italic><sub>1</sub>-score of 0.88 for S&#x0026;S extraction and 0.87 for <italic>ICD-10-CM</italic> code generation, compared with 0.73 and 0.69, respectively, for Llama 3.3-70B. 
These findings indicate that although note-level results were high, corpus-level microaveraged evaluation provided a more conservative estimate of extraction and coding performance.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Standard entity-level microaveraged performance across the independent test corpus (n=500)<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">S&#x0026;S<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> extraction</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.3-70B</td><td align="left" valign="top">0.63</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.73</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>gpt-oss-120B</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.87</td><td align="left" valign="top">0.88</td></tr><tr><td align="left" valign="top" colspan="4"><italic>ICD-10-CM</italic><sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> code generation</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama 3.3-70B</td><td align="left" valign="top">0.59</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.69</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>gpt-oss-120B</td><td align="left" valign="top">0.90</td><td align="left" 
valign="top">0.84</td><td align="left" valign="top">0.87</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated by summing true positives, false positives, and false negatives across all notes after reducing predicted and reference labels to unique sets within each note. The results are shown separately for symptoms and signs extraction and <italic>ICD-10-CM</italic> code generation for Llama 3.3-70B and gpt-oss-120B. <italic>ICD-10-CM</italic> codes were evaluated at the 3-character category level.</p></fn><fn id="table4fn2"><p><sup>b</sup>S&#x0026;S: symptoms and signs.</p></fn><fn id="table4fn3"><p><sup>c</sup><italic>ICD-10-CM</italic>: <italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>.</p></fn></table-wrap-foot></table-wrap><p>Category-specific analysis (<xref ref-type="table" rid="table5">Table 5</xref>) showed that model performance varied across individual <italic>ICD-10-CM</italic> groups. For gpt-oss-120B, the highest <italic>F</italic><sub>1</sub>-scores were observed for R05 (cough; <italic>F</italic><sub>1</sub>-score=1.00), R06 (abnormalities of breathing; <italic>F</italic><sub>1</sub>-score=0.96), and R04 (hemorrhage from respiratory passages; <italic>F</italic><sub>1</sub>-score=0.94), whereas lower performance was seen for R09 (other circulatory and respiratory symptoms and signs; <italic>F</italic><sub>1</sub>-score=0.71) and R03 (abnormal blood-pressure reading; <italic>F</italic><sub>1</sub>-score=0.73). Llama 3.3-70B showed a similar pattern of stronger performance in more common and clinically explicit categories such as R06 (<italic>F</italic><sub>1</sub>-score=0.83) and R04 (<italic>F</italic><sub>1</sub>-score=0.80), but lower performance in R03 (<italic>F</italic><sub>1</sub>-score=0.33) and R01 (cardiac murmurs and other cardiac sounds; <italic>F</italic><sub>1</sub>-score=0.40). 
These results suggest that model accuracy was higher for more explicit and frequently occurring symptom categories, while broader or less frequently represented categories remained more challenging.</p><p>For Llama 3.3-70B, 452 (90.4%) of 500 test clinical notes were correctly matched to the expert labels, leaving 48 (9.6%) notes with at least 1 error. Among these 48 error notes, 26 (54.2%) contained hallucinated S&#x0026;S, 10 (20.8%) included S&#x0026;S or diagnoses from noncardiorespiratory systems that were incorrectly identified as cardiorespiratory S&#x0026;S, and 14 (29.2%) missed at least 1 true cardiorespiratory S&#x0026;S present in the clinical note. These error categories were not mutually exclusive, and some notes contained more than 1 error type. For gpt-oss-120B, 477 (95.4%) of 500 test clinical notes were correctly matched to the expert labels, leaving 23 (4.6%) notes with at least 1 error. Among these 23 error notes, 4 (17.4%) contained hallucinated S&#x0026;S, 7 (30.4%) included S&#x0026;S or diagnoses from noncardiorespiratory systems that were incorrectly identified as cardiorespiratory S&#x0026;S, and 12 (52.2%) missed at least 1 true cardiorespiratory S&#x0026;S present in the clinical note. As with Llama 3.3-70B, these categories were not mutually exclusive. 
The most frequent errors reflected assumption-based interpretation, particularly inferring increased or decreased blood pressure from numerical blood pressure readings alone.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Category-specific <italic>ICD-10-CM</italic> code generation performance for each included 3-character cardiorespiratory category in the independent test dataset<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2"><italic>ICD-10-CM</italic><sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> category</td><td align="left" valign="bottom" colspan="3">Llama 3.3-70B</td><td align="left" valign="bottom" colspan="3">gpt-oss-120B</td></tr><tr><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">R00</td><td align="left" valign="top">0.42</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.75</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.86</td></tr><tr><td align="left" valign="top">R01</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.50</td><td align="left" valign="top">0.40</td><td align="left" valign="top">0.67</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.80</td></tr><tr><td align="left" valign="top">R03</td><td align="left" valign="top">0.40</td><td align="left" valign="top">0.29</td><td align="left" valign="top">0.33</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.57</td><td align="left" 
valign="top">0.73</td></tr><tr><td align="left" valign="top">R04</td><td align="left" valign="top">0.73</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.80</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.89</td><td align="left" valign="top">0.94</td></tr><tr><td align="left" valign="top">R05</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.75</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td></tr><tr><td align="left" valign="top">R06</td><td align="left" valign="top">0.70</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.96</td></tr><tr><td align="left" valign="top">R07</td><td align="left" valign="top">0.53</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.77</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.87</td></tr><tr><td align="left" valign="top">R09</td><td align="left" valign="top">0.47</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.58</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.71</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated separately for R00, R01, R03, R04, R05, R06, R07, and R09 for Llama 3.3-70B and gpt-oss-120B to assess variation in performance across individual cardiorespiratory code groups.</p></fn><fn id="table5fn2"><p><sup>b</sup><italic>ICD-10-CM</italic>: <italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" 
sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>We evaluated the ability of Llama 3.3-70B and gpt-oss-120B to extract S&#x0026;S and map them to corresponding <italic>ICD-10-CM</italic> codes. All <italic>ICD-10-CM</italic> codes were strictly limited to the range R00 to R09 during the manual coding process. For this reason, providing the <italic>ICD-10-CM</italic> codes to the model led to better performance in the <italic>ICD-10-CM</italic> generation task compared to the S&#x0026;S extraction task. Among the different prompting strategies tested, the multimodule LLMs with postdata cleaning achieved the highest performance across both tasks.</p><p>Initially, we only provided basic task instructions to test whether the general-purpose model possessed sufficient medical knowledge from its training data and whether it could correctly interpret medical terminology. The results showed that the model performed well in extracting medical terms. <xref ref-type="table" rid="table6">Table 6</xref> presents sample output from 4 different prompt-engineering strategies. While the model was able to identify some S&#x0026;S terms, it struggled to accurately recognize those related specifically to the cardiorespiratory system. It often failed to distinguish S&#x0026;S from other medical terms and sometimes misclassified conditions as S&#x0026;S. For example, the model incorrectly identified &#x201C;peripheral vascular disease&#x201D; as a symptom or sign, even though it is a diagnosis. 
Additionally, it mislabeled terms from other body systems, such as classifying &#x201C;trace edema at dorsum of feet and ankles&#x201D; as a cardiorespiratory symptom [<xref ref-type="bibr" rid="ref14">14</xref>].</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Representative outputs generated by the large language model (LLM) under 4 prompt-engineering conditions (instruction-only, <italic>ICD-10-CM</italic> definition-based, assumption-free constraints, and multimodule LLM with postprocessing) for a single deidentified clinical note containing cardiorespiratory symptoms<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Prompt setting</td><td align="left" valign="bottom">Sample output</td></tr></thead><tbody><tr><td align="left" valign="top">Instruction only</td><td align="left" valign="top">&#x201C;Here are the extracted symptoms and signs involving the circulatory and respiratory systems:<break/>**Circulatory System:**<list list-type="order"><list-item><p>Ventricular ectopy (I47.1) - PVCs<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> random, run off three beats consecutive</p></list-item><list-item><p>History of coronary artery disease (I25.10)</p></list-item><list-item><p>Peripheral vascular disease (I73.9)</p></list-item><list-item><p>Hypertension (I10)</p></list-item></list><break/>**Respiratory System:**<list list-type="order"><list-item><p>Cough (R05)</p></list-item><list-item><p>Shortness of breath (R06.00)</p></list-item><list-item><p>Decreased air entry (R09.02)</p></list-item><list-item><p>COPD<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup> (J44.9)</p></list-item></list><break/>Note that I excluded denied or negative symptoms and signs, as per the requirement. 
Also, some codes may have additional characters to specify laterality or other details, but I provided the most relevant code based on the information given.&#x201D;</td></tr><tr><td align="left" valign="top"><italic>ICD-10-CM</italic><sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup> code-based symptom extraction</td><td align="left" valign="top">&#x201C;Here is the list of symptoms and signs involving the circulatory and respiratory systems along with their ICD-10 codes:<break/>* Cough - R05<break/>* Shortness of breath - R06.0 (Dyspnea)<break/>* PVCs (Ventricular ectopy) - R00.1 (Bradycardia, unspecified is not correct as it&#x2019;s actually ventricular ectopy which falls under other abnormalities of heart beat) - R00.8<break/>* Tachycardia - R00.0 (The patient has a heart rate of 90)<break/>* Elevated blood-pressure reading - R03.0<break/>Note: The ICD-10 codes are based on the provided list and may not be comprehensive or entirely accurate without further context or information.&#x201D;</td></tr><tr><td align="left" valign="top">Assumption-free constraints</td><td align="left" valign="top">&#x201C;Based on the clinical note, the patient&#x2019;s current presented symptoms and signs involving the circulatory and respiratory systems are:<list list-type="order"><list-item><p>Cough (R05)</p></list-item><list-item><p>Shortness of breath (R06.0 Dyspnea)</p></list-item></list><break/>The following ICD-10 codes can be assigned:<list list-type="endash"><list-item><p>R05 (Cough)</p></list-item><list-item><p>R06.0 (Dyspnea)</p></list-item></list><break/>Note: PVCs (Premature Ventricular Contractions) are mentioned in the note, but they do not have a direct ICD-10 code match from the provided list. 
However, it is related to the cardiovascular system and can be associated with abnormalities of heart beat, but since there&#x2019;s no explicit mention of symptoms like palpitations or tachycardia that directly correlate with the PVCs as experienced by the patient, we cannot include R00 codes based on the given instructions.&#x201D;</td></tr><tr><td align="left" valign="top" colspan="2">Multimodule LLMs with postdata cleaning</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><break/>Extraction module</td><td align="left" valign="top">&#x201C;Based on the provided definitions of symptom and sign, here are the patient&#x2019;s current presented symptoms and signs extracted from the clinical note:<break/>**Symptoms:**<list list-type="order"><list-item><p>Coughing episodes for about two weeks on and off</p></list-item><list-item><p>Shortness of breath</p></list-item><list-item><p>Neck pain</p></list-item><list-item><p>Arthritis (patient is positive for arthritis)</p></list-item></list><break/>**Signs:**<list list-type="order"><list-item><p>PVCs (premature ventricular contractions) random on monitor</p></list-item><list-item><p>Decreased air entry in lungs</p></list-item><list-item><p>S4 at the apex (heart sound)</p></list-item><list-item><p>Cough (mentioned under REVIEW OF SYSTEMS: RESPIRATORY)</p></list-item></list><break/>Note that I have excluded:<break/>* Denied or negative symptoms and signs (e.g., no chest pain, no palpitations, etc.)<break/>* Signs with numerical data (e.g., heart rate of 90, blood pressure of 187/105, etc.)
as per the requirement.&#x201D;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Refinement module</td><td align="left" valign="top">R05: Coughing episodes for about two weeks on and off,<break/>R06.0: Shortness of breath,<break/>R00.1: PVCs (premature ventricular contractions),<break/>R01.1: S4 at the apex (heart sound),<break/>R06.8: Decreased air entry in lungs</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>The examples illustrate differences in hallucination, inference, symptom specificity, and <italic>ICD-10-CM</italic> mapping accuracy. <italic>ICD-10-CM</italic> code evaluation in this study was performed at the 3-character category level; therefore, subcode-level differences were not counted as errors.</p></fn><fn id="table6fn2"><p><sup>b</sup>PVC: premature ventricular contraction.</p></fn><fn id="table6fn3"><p><sup>c</sup>COPD: chronic obstructive pulmonary disease.</p></fn><fn id="table6fn4"><p><sup>d</sup><italic>ICD-10-CM</italic>: <italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic>.</p></fn></table-wrap-foot></table-wrap><p>The Llama model could generate <italic>ICD-10-CM</italic> codes, suggesting that the coding system was included in its training data by Meta. However, hallucination remained a concern, as the model occasionally produced nonexistent codes. Moreover, some medical terms lacked clear boundaries between symptoms, signs, and diagnoses. In the <italic>ICD-10-CM</italic> system, the R00-R99 range is designated for S&#x0026;S, but certain valid S&#x0026;S still had to be mapped to codes outside this range. Due to our labeling strategy, which only included S&#x0026;S mapped to R00-R09, the model&#x2019;s performance appeared limited.
Nonetheless, the model demonstrated its ability to extract medical concepts and associate them with <italic>ICD-10-CM</italic> codes, warranting further investigation.</p><p>We then provided a list of <italic>ICD-10-CM</italic> codes and their definitions related to the cardiorespiratory system with the prompt designed to minimize hallucinations and improve the model&#x2019;s ability to identify relevant S&#x0026;S. Model performance improved significantly, and the model was able to extract most cardiorespiratory-related S&#x0026;S accurately. Furthermore, the model exhibited the ability to draw inferences based on clinical context, which contributed to high recall. However, this introduced new challenges, including overinterpretation and assumption-based errors, resulting in lower precision. For instance, in one note that mentioned &#x201C;congested respirations,&#x201D; the model inferred and generated &#x201C;cough&#x201D; as a symptom, justifying it with the interpretation: &#x201C;congested respirations and mild crackles are present, which can be related to cough (R05).&#x201D; This example illustrates the model&#x2019;s tendency to infer S&#x0026;S based on contextual associations rather than relying solely on explicitly stated information, resulting in false positives.</p><p>To reduce overinterpretation, we refined our third prompt to instruct the model to extract only explicitly stated S&#x0026;S. This adjustment improved precision, as a higher proportion of the identified terms was correct, while recall declined only slightly.</p><p>In the fourth prompt, we enforced a standardized output format that included only the extracted S&#x0026;S along with their corresponding <italic>ICD-10-CM</italic> codes. This structured format was designed to support an automated evaluation pipeline. However, imposing all constraints within a single model introduced complexity.
The model was expected to complete 6 distinct tasks simultaneously: extract S&#x0026;S, restrict to the cardiorespiratory system, map to <italic>ICD-10-CM</italic> codes, exclude negated S&#x0026;S, include only explicitly mentioned ones, and format the output accordingly. Relying on a single model led to occasional errors, such as missing explicitly mentioned S&#x0026;S or including inferred ones, thereby reducing overall accuracy.</p><p>The multimodule LLM approach addressed this issue by distributing subtasks across specialized models, leading to more stable and accurate outputs. Additionally, by including definitions of S&#x0026;S within the prompt, the model gained a clearer understanding of the extraction task. However, through further testing, we observed that the model often misclassified numerical examination results, such as blood pressure readings, as S&#x0026;S. To address this, we explicitly instructed the model to exclude examination findings with numerical data. This comprehensive strategy resulted in the highest performance.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Prior work on clinical text analysis has largely focused on downstream prediction rather than explicit symptom extraction. Transformer-based models such as ClinicalBERT and Med-BERT have demonstrated strong performance in modeling clinical notes for outcomes such as hospital readmission and disease risk prediction; however, they do not explicitly extract S&#x0026;S or map them to standardized coding systems such as <italic>ICD-10-CM</italic> [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. 
Applying traditional machine learning models to our task would require large amounts of labeled training data for both named entity recognition and entity linking, substantially increasing data requirements, time investment, and labor burden.</p><p>More recent studies have applied LLMs to structured information extraction using prompt engineering. For example, prompt-based GPT approaches have been used to extract clinical factors from pathology or radiology reports with high accuracy and improved efficiency compared with manual abstraction [<xref ref-type="bibr" rid="ref9">9</xref>]. However, these studies typically relied on proprietary, cloud-based models and did not systematically address hallucination or over-inference when processing narrative clinical notes. Hybrid retrieval-augmented generation pipelines have also been proposed to improve precision in extracting specific clinical attributes, such as substance use, by constraining model input to relevant text segments [<xref ref-type="bibr" rid="ref13">13</xref>]. While effective, these approaches introduce additional system complexity and external dependencies.</p><p>In contrast, our study demonstrates that a locally deployed, open-source LLM, combined with assumption-free prompting and a multimodule framework, can accurately extract explicitly stated cardiorespiratory S&#x0026;S and map them to <italic>ICD-10-CM</italic> codes. This work extends prior research by emphasizing hallucination control, standardized coding, and privacy-preserving deployment, highlighting the feasibility of open-source LLMs for reliable clinical symptom extraction. Future model development should adhere to principles of responsible artificial intelligence apps in medicine [<xref ref-type="bibr" rid="ref24">24</xref>]. 
Accurate extraction of symptoms from clinical notes can facilitate artificial intelligence&#x2013;assisted clinical decision support embedded in electronic health records [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec><sec id="s4-3"><title>Limitations and Future Directions</title><p>Although the multimodule framework with postprocessing achieved the best overall performance, it did not fully eliminate subcode-level imprecision in <italic>ICD-10-CM</italic> mapping. For example, in <xref ref-type="table" rid="table6">Table 6</xref>, the refinement module mapped premature ventricular contractions to R00.1, which denotes unspecified bradycardia and is clinically discordant with premature ventricular contractions. This therefore represents a clinically incorrect mapping. However, because our evaluation considered only the first 3 characters of the <italic>ICD-10-CM</italic> code, this output was treated as a correct R00 category-level mapping (abnormalities of heartbeat). Future work should improve subcode-level specificity and clinical precision in <italic>ICD-10-CM</italic> assignment.</p><p>The red-flag keyword filter used during postprocessing was intentionally conservative and improved overall performance, but it was also a coarse rule-based step. For example, because the keyword list included terms such as &#x201C;explicitly,&#x201D; valid outputs could theoretically be removed if the model echoed instruction-like language in its structured response. Although postprocessing improved overall <italic>F</italic><sub>1</sub>-scores, this approach may have increased precision at the cost of removing some true positive outputs and should be refined in future work.</p><p>In addition, the scope of this study was restricted to cardiorespiratory S&#x0026;S, which limits generalizability to other clinical domains.
Future work should expand the dataset, incorporate a broader range of symptom domains, and further refine prompt engineering and multimodule orchestration strategies to improve robustness and adaptability across diverse biomedical contexts.</p><p>This study did not systematically evaluate model-specific runtime options, such as reasoning effort levels or structured output enforcement, because the primary objective was to assess the effects of prompt design and workflow structure under a fixed inference setting. We also did not compare our framework against proprietary frontier models, such as GPT-4. Although such models could provide an additional reference benchmark, they would not represent a fixed upper performance bound because extraction accuracy depends on factors such as model version, prompting strategy, runtime configuration, and evaluation design. In addition, this study was intentionally focused on privacy-preserving, locally deployable workflows for clinical text processing. Future work should investigate whether runtime configuration options further improve output stability and extraction accuracy and should include direct comparisons between open-weight local models and high-performing closed commercial models under standardized prompts and evaluation settings.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In this study, Llama 3.3-70B was used primarily for prompt development to support the extraction of clinical S&#x0026;S and their mapping to <italic>ICD-10-CM</italic> codes. We then evaluated the final multimodule LLM framework with postprocessing on an independent test set of 500 clinical notes using both Llama 3.3-70B and gpt-oss-120B. 
Under the entity-level microaveraged evaluation across the full corpus, Llama 3.3-70B achieved <italic>F</italic><sub>1</sub>-scores of 0.73 for S&#x0026;S extraction and 0.69 for <italic>ICD-10-CM</italic> code generation, whereas gpt-oss-120B achieved <italic>F</italic><sub>1</sub>-scores of 0.88 and 0.87, respectively. In note-level macroaveraged evaluation, performance was higher, with Llama 3.3-70B achieving <italic>F</italic><sub>1</sub>-scores of 0.95 for both S&#x0026;S extraction and <italic>ICD-10-CM</italic> mapping and gpt-oss-120B achieving <italic>F</italic><sub>1</sub>-scores of 0.97 and 0.98. Overall, the framework demonstrates the feasibility of locally deployable LLMs for structured extraction of cardiorespiratory S&#x0026;S from clinical notes and may strengthen privacy protection by allowing on-premises processing without external data transmission.</p></sec></sec></body><back><ack><p>The authors used the generative artificial intelligence tools Llama 3.3-70B by Meta and gpt-oss by OpenAI to extract symptoms and signs from clinical notes. ChatGPT by OpenAI was used for grammar correction during manuscript generation.</p></ack><notes><sec><title>Funding</title><p>The project was funded in part by National Institutes of Health grant R33HL143317.</p></sec><sec><title>Data Availability</title><p>The datasets analyzed during this study are available in the MTSamples database [<xref ref-type="bibr" rid="ref16">16</xref>]. The labeled data in our study are available from the corresponding author on reasonable request.
The code can be accessed at [<xref ref-type="bibr" rid="ref27">27</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: YB, JF</p><p>Data curation: YB, WC, JF</p><p>Formal analysis: YB</p><p>Funding acquisition: JF</p><p>Investigation: YB</p><p>Methodology: YB</p><p>Resources: WC, JF</p><p>Software: YB</p><p>Supervision: JF</p><p>Validation: YB, WC, JF</p><p>Visualization: YB</p><p>Writing &#x2013; original draft: YB</p><p>Writing &#x2013; review &#x0026; editing: WC, JF</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">COPD</term><def><p>chronic obstructive pulmonary disease</p></def></def-item><def-item><term id="abb2">EM</term><def><p>extraction module</p></def></def-item><def-item><term id="abb3"><italic>ICD-10-CM</italic></term><def><p><italic>International Classification of Diseases, Tenth Revision, Clinical Modification</italic></p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb6">RM</term><def><p>refinement module</p></def></def-item><def-item><term id="abb7">S&#x0026;S</term><def><p>symptom and sign</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamilton</surname><given-names>W</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Round</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sharp</surname><given-names>D</given-names> </name></person-group><article-title>What are the clinical features of lung cancer before the diagnosis is made? 
A population based case-control study</article-title><source>Thorax</source><year>2005</year><month>12</month><volume>60</volume><issue>12</issue><fpage>1059</fpage><lpage>1065</lpage><pub-id pub-id-type="doi">10.1136/thx.2005.045880</pub-id><pub-id pub-id-type="medline">16227326</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prado</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Kessler</surname><given-names>LG</given-names> </name><name name-style="western"><surname>Au</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>Symptoms and signs of lung cancer prior to diagnosis: case-control study using electronic health records from ambulatory care within a large US-based tertiary care centre</article-title><source>BMJ Open</source><year>2023</year><month>04</month><day>20</day><volume>13</volume><issue>4</issue><fpage>e068832</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2022-068832</pub-id><pub-id pub-id-type="medline">37080616</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rivera</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Mehta</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Wahidi</surname><given-names>MM</given-names> </name></person-group><article-title>Establishing the diagnosis of lung cancer: diagnosis and management of lung cancer, 3rd ed: American College of Chest Physicians evidence-based clinical practice guidelines</article-title><source>Chest</source><year>2013</year><month>05</month><volume>143</volume><issue>5 Suppl</issue><fpage>e142S</fpage><lpage>e165S</lpage><pub-id pub-id-type="doi">10.1378/chest.12-2353</pub-id><pub-id 
pub-id-type="medline">23649436</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bahadori</surname><given-names>MT</given-names> </name><name name-style="western"><surname>Kulas</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Schuetz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Stewart</surname><given-names>WF</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>RETAIN: an interpretable predictive model for healthcare using reverse time attention mechanism</article-title><access-date>2026-05-07</access-date><conf-name>Advances in Neural Information Processing Systems (NeurIPS)</conf-name><conf-date>Dec 5-10, 2016</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://papers.neurips.cc/paper/2016/file/231141b34c82aa95e48810a9d1b33a79-Paper.pdf">https://papers.neurips.cc/paper/2016/file/231141b34c82aa95e48810a9d1b33a79-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah-Mohammadi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name></person-group><article-title>NLP-assisted differential diagnosis of chronic obstructive pulmonary disease exacerbation</article-title><source>Stud Health Technol Inform</source><year>2024</year><month>01</month><day>25</day><volume>310</volume><fpage>589</fpage><lpage>593</lpage><pub-id pub-id-type="doi">10.3233/SHTI231033</pub-id><pub-id pub-id-type="medline">38269877</pub-id></nlm-citation></ref><ref 
id="ref6"><label>6</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Altosaar</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ranganath</surname><given-names>R</given-names> </name></person-group><article-title>ClinicalBERT: modeling clinical notes and predicting hospital readmission</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 10, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1904.05342</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rasmy</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xiang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Tao</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhi</surname><given-names>D</given-names> </name></person-group><article-title>Med-BERT: pretrained contextualized embeddings on large-scale structured electronic health records for disease prediction</article-title><source>NPJ Digit Med</source><year>2021</year><month>05</month><day>20</day><volume>4</volume><issue>1</issue><fpage>86</fpage><pub-id pub-id-type="doi">10.1038/s41746-021-00455-y</pub-id><pub-id pub-id-type="medline">34017034</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Solares</surname><given-names>JRA</given-names> 
</name><etal/></person-group><article-title>BEHRT: transformer for electronic health records</article-title><source>Sci Rep</source><year>2020</year><volume>10</volume><issue>1</issue><fpage>7155</fpage><pub-id pub-id-type="doi">10.1038/s41598-020-62922-y</pub-id><pub-id pub-id-type="medline">32346050</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Song</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>BS</given-names> </name></person-group><article-title>Developing prompts from large language model for extracting clinical information from pathology and ultrasound reports in breast cancer</article-title><source>Radiat Oncol J</source><year>2023</year><month>09</month><volume>41</volume><issue>3</issue><fpage>209</fpage><lpage>216</lpage><pub-id pub-id-type="doi">10.3857/roj.2023.00633</pub-id><pub-id pub-id-type="medline">37793630</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah-Mohammadi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name></person-group><article-title>Extraction of substance use information from clinical notes: generative pretrained transformer-based investigation</article-title><source>JMIR Med Inform</source><year>2024</year><month>08</month><day>19</day><volume>12</volume><fpage>e56243</fpage><pub-id pub-id-type="doi">10.2196/56243</pub-id><pub-id 
pub-id-type="medline">39037700</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mamouei</surname><given-names>M</given-names> </name><name name-style="western"><surname>Salimi-Khorshidi</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Hi-BEHRT: hierarchical transformer-based model for accurate prediction of clinical events using multimodal longitudinal electronic health records</article-title><source>IEEE J Biomed Health Inform</source><year>2023</year><volume>27</volume><issue>2</issue><fpage>1106</fpage><lpage>1117</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2022.3224727</pub-id><pub-id pub-id-type="medline">36427286</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Long</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>M</given-names> </name><etal/></person-group><article-title>BioRAG: a RAG-LLM framework for biological question reasoning</article-title><source>arXiv</source><comment>Preprint posted online on  August 2, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.01107</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah-Mohammadi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name></person-group><article-title>Utilizing RAG and GPT-4 for extraction of substance use information from clinical notes</article-title><source>Stud 
Health Technol Inform</source><year>2024</year><month>11</month><day>22</day><volume>321</volume><fpage>94</fpage><lpage>98</lpage><pub-id pub-id-type="doi">10.3233/SHTI241070</pub-id><pub-id pub-id-type="medline">39575787</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>W</given-names> </name><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name></person-group><article-title>Performance of open-source large language models to extract symptoms from clinical notes</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>08</month><day>7</day><volume>329</volume><fpage>663</fpage><lpage>667</lpage><pub-id pub-id-type="doi">10.3233/SHTI250923</pub-id><pub-id pub-id-type="medline">40775941</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>W</given-names> </name><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name></person-group><article-title>Utilizing open-source large language models to extract genitourinary symptoms from clinical notes</article-title><source>Stud Health Technol Inform</source><year>2025</year><month>06</month><day>26</day><volume>328</volume><fpage>16</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.3233/SHTI250664</pub-id><pub-id pub-id-type="medline">40588872</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><source>MTSamples.com</source><access-date>2026-05-11</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.mtsamples.com/">https://www.mtsamples.com/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Symptoms and signs involving the circulatory and respiratory systems R00-R09</article-title><source>ICD10Data</source><access-date>2026-05-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.icd10data.com/ICD10CM/Codes/R00-R99/R00-R09">https://www.icd10data.com/ICD10CM/Codes/R00-R99/R00-R09</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Llama3.3</article-title><source>Ollama</source><year>2024</year><access-date>2026-05-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ollama.com/library/llama3.3">https://ollama.com/library/llama3.3</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ahmad</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Applebaum</surname><given-names>A</given-names> </name><name name-style="western"><surname>Arbus</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Gpt-oss-120b &#x0026; gpt-oss-20b model card</article-title><source>arXiv</source><comment>Preprint posted online on Aug 8, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2508.10925</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Bansal</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>AutoGen: enabling next-gen LLM applications via multi-agent conversation</article-title><source>arXiv</source><comment>Preprint posted online on Aug 16, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.08155</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hammoud</surname><given-names>HAK</given-names> </name><name name-style="western"><surname>Itani</surname><given-names>H</given-names> </name><name name-style="western"><surname>Khizbullin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ghanem</surname><given-names>B</given-names> </name></person-group><article-title>CAMEL: communicative agents for &#x201C;mind&#x201D; exploration of large language model society</article-title><source>arXiv</source><comment>Preprint posted online on Mar 31, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.17760</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hong</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhuge</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>MetaGPT: meta 
programming for a multi-agent collaborative framework</article-title><access-date>2026-05-07</access-date><conf-name>The Twelfth International Conference on Learning Representations (ICLR 2024)</conf-name><conf-date>May 7-11, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=VtmBAGCN7o">https://openreview.net/forum?id=VtmBAGCN7o</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pei</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chawla</surname><given-names>NV</given-names> </name><etal/></person-group><article-title>Large language model based multi-agents: a survey of progress and challenges</article-title><conf-name>Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence (IJCAI-24)</conf-name><conf-date>Aug 3-9, 2024</conf-date><conf-loc>Jeju Island, South Korea</conf-loc><fpage>8048</fpage><lpage>8057</lpage><pub-id pub-id-type="doi">10.24963/ijcai.2024/890</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smiley</surname><given-names>A</given-names> </name><name name-style="western"><surname>Reategui-Rivera</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Villarreal-Zegarra</surname><given-names>D</given-names> </name><name name-style="western"><surname>Escobar-Agreda</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name></person-group><article-title>Exploring artificial intelligence biases in predictive models for cancer diagnosis</article-title><source>Cancers (Basel)</source><year>2025</year><month>01</month><day>26</day><volume>17</volume><issue>3</issue><fpage>407</fpage><pub-id pub-id-type="doi">10.3390/cancers17030407</pub-id><pub-id pub-id-type="medline">39941776</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gabriel</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schmer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Truong</surname><given-names>TT</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>A</given-names> </name></person-group><article-title>Identifying facilitators and barriers to implementation of AI-assisted clinical decision support in an electronic health record system</article-title><source>J Med Syst</source><year>2024</year><month>09</month><day>18</day><volume>48</volume><issue>1</issue><fpage>89</fpage><pub-id pub-id-type="doi">10.1007/s10916-024-02104-9</pub-id><pub-id pub-id-type="medline">39292314</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kawamoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Finkelstein</surname><given-names>J</given-names> </name><name name-style="western"><surname>Del Fiol</surname><given-names>G</given-names> </name></person-group><article-title>Implementing machine learning in the electronic health record: checklist of essential 
considerations</article-title><source>Mayo Clin Proc</source><year>2023</year><month>03</month><volume>98</volume><issue>3</issue><fpage>366</fpage><lpage>369</lpage><pub-id pub-id-type="doi">10.1016/j.mayocp.2023.01.013</pub-id><pub-id pub-id-type="medline">36868743</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Resources</article-title><source>Arizona Telemedicine Program</source><access-date>2026-05-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://telemedicine.arizona.edu/resources">https://telemedicine.arizona.edu/resources</ext-link></comment></nlm-citation></ref></ref-list></back></article>