<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v4i1e67239</article-id>
      <article-id pub-id-type="pmid"/>
      <article-id pub-id-type="doi">10.2196/67239</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Improving the Robustness and Clinical Applicability of Automatic Respiratory Sound Classification Using Deep Learning–Based Audio Enhancement: Algorithm Development and Validation</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Luo</surname>
            <given-names>Gang</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Abd El-Hafeez</surname>
            <given-names>Tarek</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Meedeniya</surname>
            <given-names>Dulani</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Tzeng</surname>
            <given-names>Jing-Tong</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2053-0581</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Jeng-Lin</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9261-1524</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Huan-Yu</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0055-8034</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Chun-Hsiang</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3097-153X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Chi-Hsin</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1826-6598</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Fan</surname>
            <given-names>Cheng-Yi</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1585-0038</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Edward Pei-Chuan</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4800-2561</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>Chi-Chun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Department of Electrical Engineering</institution>
            <institution>National Tsing Hua University</institution>
            <addr-line>101, Section 2, Kuang-Fu Road</addr-line>
            <addr-line>Hsinchu, 300</addr-line>
            <country>Taiwan</country>
            <phone>886 35162439</phone>
            <email>cclee@ee.nthu.edu.tw</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0186-4321</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>College of Semiconductor Research</institution>
        <institution>National Tsing Hua University</institution>
        <addr-line>Hsinchu</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Electrical Engineering</institution>
        <institution>National Tsing Hua University</institution>
        <addr-line>Hsinchu</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Emergency Medicine</institution>
        <institution>National Taiwan University Hospital Hsin-Chu Branch</institution>
        <addr-line>Hsinchu</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Emergency Medicine</institution>
        <institution>National Taiwan University Hospital</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Chi-Chun Lee <email>cclee@ee.nthu.edu.tw</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>3</month>
        <year>2025</year>
      </pub-date>
      <volume>4</volume>
      <elocation-id>e67239</elocation-id>
      <history>
        <date date-type="received">
          <day>6</day>
          <month>10</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>3</day>
          <month>12</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>26</day>
          <month>1</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>27</day>
          <month>1</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Jing-Tong Tzeng, Jeng-Lin Li, Huan-Yu Chen, Chun-Hsiang Huang, Chi-Hsin Chen, Cheng-Yi Fan, Edward Pei-Chuan Huang, Chi-Chun Lee. Originally published in JMIR AI (https://ai.jmir.org), 13.03.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2025/1/e67239" xlink:type="simple"/>
      <related-article related-article-type="correction-forward" xlink:title="This is a corrected version. See correction statement in:" xlink:href="https://ai.jmir.org/2025/1/e76150" vol="4" page="e76150"> </related-article>

      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Deep learning techniques have shown promising results in the automatic classification of respiratory sounds. However, accurately distinguishing these sounds in real-world noisy conditions poses challenges for clinical deployment. In addition, predicting signals with only background noise could undermine user trust in the system.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to investigate the feasibility and effectiveness of incorporating a deep learning–based audio enhancement preprocessing step into automatic respiratory sound classification systems to improve robustness and clinical applicability.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We conducted extensive experiments using various audio enhancement model architectures, including time-domain and time-frequency–domain approaches, in combination with multiple classification models to evaluate the effectiveness of the audio enhancement module in an automatic respiratory sound classification system. The classification performance was compared against the baseline noise injection data augmentation method. These experiments were carried out on 2 datasets: the International Conference in Biomedical and Health Informatics (ICBHI) respiratory sound dataset, which contains 5.5 hours of recordings, and the Formosa Archive of Breath Sound dataset, which comprises 14.6 hours of recordings. Furthermore, a physician validation study involving 7 senior physicians was conducted to assess the clinical utility of the system.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The integration of the audio enhancement module resulted in a 21.88% increase with <italic>P</italic>&lt;.001 in the ICBHI classification score on the ICBHI dataset and a 4.1% improvement with <italic>P</italic>&lt;.001 on the Formosa Archive of Breath Sound dataset in multi-class noisy scenarios. Quantitative analysis from the physician validation study revealed improvements in efficiency, diagnostic confidence, and trust during model-assisted diagnosis, with workflows that integrated enhanced audio leading to an 11.61% increase in diagnostic sensitivity and facilitating high-confidence diagnoses.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Incorporating an audio enhancement algorithm significantly enhances the robustness and clinical utility of automatic respiratory sound classification systems, improving performance in noisy environments and fostering greater trust among medical professionals.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>respiratory sound</kwd>
        <kwd>lung sound</kwd>
        <kwd>audio enhancement</kwd>
        <kwd>noise robustness</kwd>
        <kwd>clinical applicability</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Respiratory sounds play a crucial role in pulmonary pathology. They provide insights into the condition of the lungs noninvasively and assist disease diagnosis through specific sound patterns and characteristics [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. For instance, wheezing is a continuous high-frequency sound that often indicates typical symptoms of chronic obstructive pulmonary disease and asthma [<xref ref-type="bibr" rid="ref3">3</xref>]; crackling, on the other hand, is an intermittent low-frequency sound with a shorter duration that is a common respiratory sound feature among patients with lung infections [<xref ref-type="bibr" rid="ref4">4</xref>]. The advancement of machine learning algorithms and medical devices enables researchers to investigate approaches for developing automated respiratory sound classification systems, reducing the reliance on manual inputs from physicians and medical professionals.</p>
        <p>In earlier studies, researchers have engineered handcrafted audio features for respiratory sound classification [<xref ref-type="bibr" rid="ref5">5</xref>]. Recently, neural network–based methods have become the de facto methods for lung sound classification. For example, Kim et al [<xref ref-type="bibr" rid="ref6">6</xref>] fine-tuned the pretrained VGG16 algorithm, outperforming the conventional support vector machine (SVM) classifier. Wanasinghe et al [<xref ref-type="bibr" rid="ref7">7</xref>] incorporated mel spectrograms, mel-frequency cepstral coefficients, and chroma features to expand the feature set input to a convolutional neural network (CNN), demonstrating promising results in the identification of pulmonary diseases. Pessoa et al [<xref ref-type="bibr" rid="ref8">8</xref>] proposed a hybrid CNN model architecture that integrates time-domain information with spectrogram-based features, delivering a satisfactory performance. Moreover, various advanced architectures have been proposed to extract both long-term and short-term information from respiratory sounds based on the characteristics of crackle and wheeze sounds and have shown enhanced performance [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. Recent works have used advanced contrastive learning strategies to enhance intraclass compactness and interclass separability for further improvements [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. These advancements in neural network structures have shown increasing promise in achieving reliable respiratory sound classification.</p>
        <p>Despite these advancements, significant challenges remain for the clinical deployment of automatic respiratory sound classification systems due to complex real-world noisy conditions [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Augmentation techniques, such as time shifting, speed tuning, and noise injection, have been key strategies to effectively improve the noise robustness and generalizability of a machine learning model [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. While these approaches have shown promising results in respiratory sound classification tasks, their practical utility as modules for building clinical decision support systems remains in doubt. This is primarily attributed to their inability to provide clinicians with intelligible raw audio to listen to facilitate decision-making, thus making the current augmentation-based approach seem black box and hindering acceptance and adoption by medical professionals.</p>
        <p>In fact, given the blooming use of artificial intelligence (AI) in health care, the issue of liability has been the focus. The prevailing public opinion suggests that physicians are the ones to bear responsibility for errors attributed to AI [<xref ref-type="bibr" rid="ref24">24</xref>]. Hence, when these systems are opaque and inaccessible to physicians, it becomes challenging to have them assume responsibility without a clear understanding of the decision-making process. This difficulty is particularly pronounced for seasoned and senior physicians, who hesitate to endorse AI recommendations without transparent rationale. The resulting lack of trust contributes to conflicts in clinical applications. Therefore, elucidating the decision-making process is crucial to establishing the trust of physicians [<xref ref-type="bibr" rid="ref25">25</xref>]. Moreover, exceptions are frequent in the field of medicine. For instance, in cases in which bronchioles undergo significant constriction, the wheezing sound may diminish to near silence, a phenomenon referred to as silent wheezing. This intricacy could confound AI systems, necessitating human intervention (ie, listening directly to the recorded audio) [<xref ref-type="bibr" rid="ref26">26</xref>].</p>
        <p>To address these challenges, we propose an approach that involves integrating an audio enhancement module into the respiratory sound classification system, as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. This module aims to achieve noise-robust respiratory sound classification performance while providing clean audio recordings on file to support physicians’ decision-making. By enhancing the audio quality and preserving critical information, our system aimed to facilitate more accurate assessments and foster trust among medical professionals. Specifically, we devised 2 major experiments to evaluate this approach in this study. First, we compared the performance of our noise-robust system through audio enhancement to the conventional method of noise augmentation (noise injection) under various clinical noise conditions and signal-to-noise ratios (SNRs). Second, we conducted a physician validation study to assess confidence and reliability when listening to our cleaned audio for respiratory sound class identification. To the best of our knowledge, this is the first study showing that deep learning enhancement architecture can effectively remove noise while preserving discriminative information for respiratory sound classification algorithms and physicians. Importantly, this study validates the clinical potential and practicality of our proposed audio enhancement front-end module, contributing to more robust respiratory sound classification systems and aiding physicians in making accurate and reliable assessments.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>An overview of our proposed noise-robust respiratory sound classification system with audio enhancement. CNN: convolutional neural network; CNN14: 14-layer CNN; conformer: convolution-augmented transformer; ISTFT: inverse short-time Fourier transform; STFT: short-time Fourier transform; TS: 2 stage.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e67239_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Related Work</title>
        <sec>
          <title>Audio Enhancement</title>
          <p>Audio enhancement is a technique that has been widely used in the speech domain, where it is referred to as speech enhancement. These techniques are primarily used in the front-end stage of automatic speech recognition systems to improve intelligibility [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. Within speech enhancement, deep neural network approaches can be categorized into 2 main domains: time-frequency–domain approaches and time-domain approaches.</p>
          <p>Time-frequency–domain approaches are used to estimate clean audio from the short-time Fourier transform (STFT) spectrogram, which provides both time and frequency information. Kumar and Florencio [<xref ref-type="bibr" rid="ref30">30</xref>] leveraged noise-aware training [<xref ref-type="bibr" rid="ref31">31</xref>] with psychoacoustic models, which decided the importance of frequency for speech enhancement. The result demonstrated the potential of deep neural network–based speech enhancement in complex multiple-noise conditions, such as real-world environments. In the research by Yin et al [<xref ref-type="bibr" rid="ref32">32</xref>], they designed a 2-stream architecture that predicts amplitude and phase separately and further improves the performance. However, various research studies [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref35">35</xref>] have indicated that the conventional loss functions used in regression models (eg, L<sub>1</sub> and L<sub>2</sub>) do not strongly correlate with speech quality, intelligibility, and word error rate. To address the issue of discriminator evaluation mismatch, Fu et al [<xref ref-type="bibr" rid="ref36">36</xref>] introduced MetricGAN. This approach tackles the problem of metrics that are not entirely aligned with the discriminator’s way of distinguishing between real and fake samples. They used perceptual evaluation of speech quality (PESQ) [<xref ref-type="bibr" rid="ref37">37</xref>] and short-time objective intelligibility (STOI) [<xref ref-type="bibr" rid="ref38">38</xref>] as evaluation functions, which are commonly used for assessing speech quality and intelligibility, as labels for the discriminator. Furthermore, the performance of MetricGAN can be enhanced by adding a learnable sigmoid function for mask estimation, including noisy recording for discriminator training, and using a replay buffer to increase sample size [<xref ref-type="bibr" rid="ref39">39</xref>]. 
Recently, convolution-augmented transformers (conformers) have been widely used in automatic speech recognition and speech separation tasks due to their capacity in long-range and local contexts [<xref ref-type="bibr" rid="ref40">40</xref>-<xref ref-type="bibr" rid="ref42">42</xref>]. Cao et al [<xref ref-type="bibr" rid="ref43">43</xref>] introduced a conformer-based metric generative adversarial network (CMGAN), which leverages the conformer structure along with MetricGAN for speech enhancement. In the CMGAN model, multiple 2-stage conformers are used to aggregate magnitude and complex spectrogram information in the encoder. In the decoder, the prediction of the magnitude and complex spectrogram are decoupled and then jointly incorporated to reconstruct the enhanced recordings. Furthermore, CMGAN achieved state-of-the-art results on the VoiceBank+DEMAND dataset [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>].</p>
          <p>On the other hand, time-domain approaches directly estimate the clean audio from the raw signal, encompassing both the magnitude and phase information, enabling them to enhance noisy speech in both domains jointly. Macartney and Weyde [<xref ref-type="bibr" rid="ref46">46</xref>] leveraged Wave-U-Net, proposed in the study by Thiemann et al [<xref ref-type="bibr" rid="ref44">44</xref>], to use the U-Net structure in a 1D time domain and demonstrated promising results in audio source separation for speech enhancement. Wave-U-Net uses a series of downsampling and upsampling blocks with skip connections to make predictions. However, its effectiveness in representing long signal sequences is limited due to its restricted receptive field. To overcome this limitation, the approaches presented in the studies by Pandey and Wang [<xref ref-type="bibr" rid="ref47">47</xref>] and Wang et al [<xref ref-type="bibr" rid="ref48">48</xref>] divided the signals into small chunks and repeatedly processed local and global information to expand the receptive field. This dual-path structure successfully improved the efficiency in capturing long sequential features. However, dual-path structures are not memory efficient as they require retaining the entire long signal during training. To address the memory efficiency issue, Park et al [<xref ref-type="bibr" rid="ref49">49</xref>] proposed a multi-view attention network. They used residual conformer blocks to enrich channel representation and introduced multi-view attention blocks consisting of channel, global, and local attention mechanisms, enabling the extraction of features that reflect both local and global information. This approach also demonstrated state-of-the-art performance on the VoiceBank+DEMAND dataset [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>].</p>
          <p>Both approaches have made significant progress in performance improvements in recent years. However, their suitability for enhancing respiratory sounds collected through stethoscopes remains unclear. Therefore, for this study, we applied these 2 branches of enhancement models and compared their effectiveness in enhancing respiratory sounds in real-world noisy hospital settings [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref49">49</xref>].</p>
        </sec>
        <sec>
          <title>Respiratory Sound Classification</title>
          <p>In recent years, automatic respiratory sound classification systems have become an active research area. Several studies have explored the use of pretrained weights from deep learning models, showing promising results. Kim et al [<xref ref-type="bibr" rid="ref6">6</xref>] demonstrated improved performance over SVMs by fine-tuning the pretrained VGG16 algorithm. Gairola et al [<xref ref-type="bibr" rid="ref22">22</xref>] used effective preprocessing methods, data augmentation techniques, and transfer learning from ImageNet [<xref ref-type="bibr" rid="ref50">50</xref>] pretrained weights to address data scarcity and further enhance performance.</p>
          <p>As large-scale audio datasets [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>] become more accessible, pretrained audio models are gaining traction, exhibiting promising performance in various audio tasks [<xref ref-type="bibr" rid="ref53">53</xref>-<xref ref-type="bibr" rid="ref55">55</xref>]. Studies have explored leveraging these pretrained audio models for respiratory sound classification. Moummad and Farrugia [<xref ref-type="bibr" rid="ref17">17</xref>] incorporated supervised contrastive loss on metadata with the pretrained 6-layer CNN architecture [<xref ref-type="bibr" rid="ref53">53</xref>] to improve the quality of learned features from the encoder. Chang et al [<xref ref-type="bibr" rid="ref56">56</xref>] introduced a novel gamma patch-wise correction augmentation technique, which they applied to the fine-tuned 14-layer CNN (CNN14) architecture [<xref ref-type="bibr" rid="ref53">53</xref>], achieving state-of-the-art performance. Bae et al [<xref ref-type="bibr" rid="ref16">16</xref>] used the pretrained Audio Spectrogram Transformer (AST) [<xref ref-type="bibr" rid="ref54">54</xref>] with a Patch-Mix strategy to prevent overfitting and improve performance. Kim et al [<xref ref-type="bibr" rid="ref57">57</xref>] proposed a representation-level augmentation technique to effectively leverage different pretrained models with various input types, demonstrating promising results on the pretrained ResNet, EfficientNet, 6-layer CNN, and AST.</p>
          <p>However, few of these studies have explicitly addressed the challenge of noise robustness in clinical settings. To improve noise robustness, data augmentation techniques such as adding white noise, time shifting, stretching, and pitch shifting have been commonly used [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. These augmentations enable networks to learn efficient features under diverse recording conditions. Nonetheless, the augmented recordings may not accurately represent the conditions in clinical settings, potentially introducing artifacts and limiting performance improvement. In contrast to the aforementioned works, Kochetov et al [<xref ref-type="bibr" rid="ref18">18</xref>] proposed a noise-masking recurrent neural network to filter out noisy frames during classification. They concatenated a binary noise classifier and an anomaly classifier with a mask layer to suppress the noisy parts, allowing only the filtered frames to pass through, thereby preventing noises from affecting the classification. However, the International Conference in Biomedical and Health Informatics (ICBHI) database lacks noise labels in the metadata, and the paper did not specify how these labels were obtained, rendering the results nonreproducible. Emmanouilidou et al [<xref ref-type="bibr" rid="ref58">58</xref>] used multiple noise suppression techniques to address various noise sources, including ambient noise, signal artifacts, heart sounds, and crying, using a soft-margin nonlinear SVM classifier with handcrafted features. Similarly, our work uses a pipeline for noise enhancement and respiratory sound classification. However, we advanced this approach by using deep learning models for both tasks, enabling our system to handle diverse noise types and levels without the need for bespoke strategies for each noise source. 
Furthermore, we validated our system’s practical utility through experiments across 2 respiratory sound databases and a physician validation study, demonstrating its improved performance and clinical relevance.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Datasets</title>
        <p>This section presents 2 respiratory sound datasets and 1 clinical noise dataset used in this study.</p>
        <sec>
          <title>ICBHI 2017 Dataset</title>
          <p>The ICBHI 2017 database is one of the largest publicly accessible datasets for respiratory sounds, comprising a total of 5.5 hours of recorded audio [<xref ref-type="bibr" rid="ref59">59</xref>]. These recordings were independently collected by 2 research teams in Portugal and Greece from 126 participants of all ages (79 adults, 46 children, and 1 unknown). The data acquisition process involved heterogeneous equipment and included recordings from both clinical and nonclinical environments. The duration of the recorded audio varies from 10 to 90 seconds. Within this database, the 920 annotated audio samples yield 6898 respiratory cycles. Among these cycles, 1864 contain crackles, 886 contain wheezes, and 506 include both crackles and wheezes, whereas the remaining cycles are categorized as normal.</p>
        </sec>
        <sec>
          <title>Formosa Archive of Breath Sound</title>
          <p>The Formosa Archive of Breath Sound (FABS) database comprises 14.6 hours of respiratory sound recordings collected from 1985 participants. Our team collected these recordings at the emergency department of the Hsin-Chu Branch at the National Taiwan University Hospital (NTUH). We used the CaRDIaRT DS101 electronic stethoscope, where each recording is 10 seconds long.</p>
          <p>To ensure the accuracy of the annotations, a team of 7 senior physicians meticulously annotated the audio samples. The annotations focused on identifying coarse crackles, wheezes, or normal respiratory sounds. Unlike the ICBHI 2017 database, our annotation process treated each audio sample in its entirety rather than splitting it into respiratory cycles. This approach reduces the need for extensive segmentation procedures and aligns with regular clinical practice. To enhance the quality of the annotations, we implemented an annotation validation flow called “cross-annotator model validation.” This involved training multiple models based on each annotator’s data and validating the models on data from other annotators. Any data with incongruent predictions were initially identified. These data then underwent additional annotation by 3 senior physicians randomly selected from the original annotation team for each sample to achieve the final consensus label. The FABS database encompasses 5238 annotated recordings, with 715 containing coarse crackles, 234 containing wheezes, and 4289 labeled as normal respiratory sound recordings. The detailed comparison between the ICBHI 2017 dataset and the FABS database is shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Comparison between the International Conference on Biomedical and Health Informatics (ICBHI) and Formosa Archive of Breath Sound (FABS) datasets.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="200"/>
              <col width="0"/>
              <col width="470"/>
              <col width="0"/>
              <col width="300"/>
              <thead>
                <tr valign="top">
                  <td colspan="3">
                    <break/>
                  </td>
                  <td colspan="2">ICBHI (n=126 patients)</td>
                  <td>FABS (n=1985 patients)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="3">Age (y), mean (SD)</td>
                  <td colspan="2">42.99 (32.08)</td>
                  <td>66.04 (17.64)</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">BMI (kg/m<sup>2</sup>), mean (SD)</td>
                  <td colspan="2">27.19 (5.34)</td>
                  <td>23.95 (4.72)</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Sex, n (%)</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Male</td>
                  <td colspan="2">79 (62.7)</td>
                  <td colspan="2">974 (49.1)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Female</td>
                  <td colspan="2">46 (36.5)</td>
                  <td colspan="2">841 (42.4)</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Unknown</td>
                  <td colspan="2">1 (0.8)</td>
                  <td colspan="2">170 (8.6)</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">Sampling rate (kHz)</td>
                  <td colspan="2">4-44.1</td>
                  <td>16</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">Duration (hours)</td>
                  <td colspan="2">5.5</td>
                  <td>14.6</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">Label</td>
                  <td colspan="2">Crackle and wheeze, crackle, wheeze, and normal</td>
                  <td>Coarse crackle, wheeze, and normal</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">Equipment</td>
                  <td colspan="2">AKG C417L microphone, Littmann Classic II SE stethoscope, Littmann 3200 electronic stethoscope, and Welch Allyn Meditron electronic stethoscope</td>
                  <td>CaRDIaRT DS101 electronic stethoscope</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>NTUH Clinical Noise Dataset</title>
          <p>The noise dataset used in this study was sourced from the NTUH Hsin-Chu Branch. To replicate the noise sounds that physicians typically encounter in real-world clinical settings, we used the CaRDIaRT DS101 electronic stethoscope for collecting the noise samples. The NTUH clinical noise dataset consists of 3 different types of clinical noises: 8 friction noises produced by the stethoscope moving on different fabric materials; 18 environment noises recorded at various locations within the hospital; and 12 patient noises generated by patients during auscultation through conversations, coughing, and snoring.</p>
        </sec>
      </sec>
      <sec>
        <title>Proposed Methods</title>
        <p>As shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>, our proposed noise-robust respiratory sound classification system includes two main components: (1) audio enhancement and (2) respiratory sound classifier.</p>
        <sec>
          <title>Audio Enhancement Module</title>
          <p>Audio enhancement is usually approached as a supervised learning problem [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], where the goal is to map noisy respiratory sound inputs to their clean counterparts. Mathematically, this task can be represented as learning a function <italic>f</italic>, mapping <italic>X</italic><sub>noisy</sub> to <italic>X</italic><sub>clean</sub>, where <italic>X</italic><sub>noisy</sub> represents the input noisy sound and <italic>X</italic><sub>clean</sub> denotes the corresponding clean sound. The enhanced output, <italic>X’</italic><sub>clean</sub>, is obtained as <italic>X’</italic><sub>clean</sub>=<italic>f</italic>(<italic>X</italic><sub>noisy</sub>) (1), where <italic>f</italic> is the audio enhancement model optimized during training.</p>
          <p>To achieve high-quality enhancement, it is crucial to carefully select reference clean recordings from the respiratory sound database to generate high-quality paired noisy-clean sound data. To address this, we used an “audio-tagging filter” approach. This approach leverages a large pretrained audio-tagging model to identify clean samples and exclude recordings with irrelevant tags from the database. Specifically, we used the CNN14 pretrained audio neural network [<xref ref-type="bibr" rid="ref53">53</xref>] that was trained on AudioSet [<xref ref-type="bibr" rid="ref51">51</xref>], a comprehensive audio dataset containing 2,063,839 training audio clips sourced from YouTube covering 527 sound classes. Audio samples with the following audio event labels were filtered out: “music,” “speech,” “fire,” “animal,” “cat,” and “domestic animals, pets.” These labels were chosen as they were among the most common predictions of the audio-tagging model, indicating a higher likelihood of significant irrelevant noise in the recordings. By excluding these labels, we could ensure that the selected recordings could be used as reference clean audio. To validate the effectiveness of the filtering process, we manually checked the filtered recordings. The results showed that the tagging precision was 92.5%, indicating that this method is efficient and trustworthy. Moreover, as it is fully automatic, it is easy to reproduce the results.</p>
          <p>In the ICBHI 2017 database, 889 clean audio samples were retained after filtering, consisting of 1812 cycles with crackling sounds, 822 cycles with wheezing sounds, 447 cycles with both crackling and wheezing sounds, and 3538 cycles with normal respiratory sounds. Meanwhile, the filtered FABS clean samples comprised 699 recordings of coarse crackle respiratory sounds, 225 recordings of wheeze respiratory sounds, and 4238 recordings of normal respiratory sounds.</p>
          <p>In this study, we used Wave-U-Net [<xref ref-type="bibr" rid="ref46">46</xref>], Phase-and-Harmonics–Aware Speech Enhancement Network (PHASEN) [<xref ref-type="bibr" rid="ref32">32</xref>], Multi-View Attention Network for Noise Erasure [<xref ref-type="bibr" rid="ref49">49</xref>], and CMGAN [<xref ref-type="bibr" rid="ref43">43</xref>] to compare the effectiveness of different model structures in enhancing respiratory sounds.</p>
        </sec>
        <sec>
          <title>Respiratory Sound Classification</title>
          <p>Training a classification model from scratch using a limited dataset may lead to suboptimal performance or overfitting. Therefore, we selected the CNN14 model proposed in the study by Kong et al [<xref ref-type="bibr" rid="ref53">53</xref>], which had been pretrained on AudioSet [<xref ref-type="bibr" rid="ref51">51</xref>], as our main classification backbone, and we further fine-tuned it on our respiratory datasets. We used log-mel spectrograms as the input feature, similar to previous works in respiratory sound classifications [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. As the dataset is highly imbalanced, we used the balanced batch-learning strategy. To further improve model generalizability and performance, we incorporated data augmentation techniques, including Mixup [<xref ref-type="bibr" rid="ref60">60</xref>] and SpecAugment [<xref ref-type="bibr" rid="ref61">61</xref>], along with triplet loss [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref62">62</xref>] to enhance feature separability.</p>
          <p>Mathematically, the classification task is formulated as a multi-class classification problem. The goal is to learn a mapping function, <italic>g</italic>: <italic>Z</italic>→<italic>Y</italic> (2), where <italic>Z</italic> represents the extracted features and <italic>Y</italic> denotes the target class labels. To obtain <italic>Z</italic>, input-enhanced audio signals <italic>X’</italic><sub>clean</sub> are transformed using the STFT to generate a spectrogram, followed by mel-filter banks to convert the frequency scale to the mel scale: <italic>Z</italic>=log-mel(STFT[<italic>X’</italic><sub>clean</sub>]) (3).</p>
          <p>During training, the total loss function <italic>L</italic><sub>c</sub> combines cross-entropy loss and triplet loss: <italic>L</italic><sub>c</sub>=<italic>L</italic><sub>CE</sub>+λ<italic>L</italic><sub>triplet</sub> (4).</p>
          <p>Through grid search, λ=0.01 leads to the best performance.</p>
        </sec>
        <sec>
          <title>Physician Validation Study</title>
          <p>To further evaluate the effectiveness of audio enhancement for respiratory sound, we conducted a physician validation study using the clean, noisy, and enhanced recordings from a randomly selected 25% of the testing set of the ICBHI 2017 database. In this study, we invited 7 senior physicians to independently annotate these recordings without access to any noise level or respiratory sound class label. We instructed the physicians to label the respiratory class with a confidence score ranging from 1 to 5. The objective was to demonstrate that our proposed method not only enhances the performance of the classification model but also improves the accuracy of the respiratory sound classification and increases the confidence in manual judgment done by physicians. The physician validation study was a critical step in validating the clinical practicality and effectiveness of our proposed audio enhancement preprocessing technique in clinical settings.</p>
        </sec>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study was approved by the institutional review board of the NTUH Hsin-Chu Branch (109-129-E) and complies with ethical guidelines for human research. It involved both prospective and retrospective data collection, with retrospective data fully deidentified to protect participant privacy. All prospective participants provided informed consent before data collection. No financial compensation was provided to participants, ensuring voluntary and unbiased participation.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overview</title>
        <p>To assess the noise robustness of our proposed method, we conducted a comparative analysis using methods across various levels of noise intensity, as outlined in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <boxed-text id="box1" position="float">
          <title>Methods for various levels of noise intensity.</title>
          <p>
            <bold>Clean</bold>
          </p>
          <p>The respiratory sound classification models were only trained on clean data and tested on clean data. This approach served to establish the upper-bound performance for the overall comparison.</p>
          <p>
            <bold>Noisy</bold>
          </p>
          <p>The respiratory sound classification models were trained on clean data but tested on noisy data. As the models were not optimized for noise robustness, a significant drop in performance was expected.</p>
          <p>
            <bold>Noise injection</bold>
          </p>
          <p>The respiratory sound classification models were trained on synthesized noisy data and tested on noisy data. This approach represents the conventional method to enhance the noise robustness of the model.</p>
          <p>
            <bold>Audio enhancement</bold>
          </p>
          <p>The audio enhancement model functions as a front-end preprocessing step for the classification model. To achieve this, we first optimized the audio enhancement model to achieve a satisfactory enhancement performance. Subsequently, the respiratory sound classification model was trained on the enhanced data and tested on the enhanced data.</p>
        </boxed-text>
      </sec>
      <sec>
        <title>Experiment Setup</title>
        <p>To evaluate the efficiency of our proposed method, we followed a similar setup as that in prior work [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] to have an 80%-20% train-test split on the database. Furthermore, the training set was mixed with the noise recordings from the NTUH clinical noise dataset with 4 SNRs (15, 10, 5, and 0 dB) with random time shifting. The test set was mixed with unseen noise data with 4 SNRs (17.5, 12.5, 7.5, and 2.5 dB), also subjected to random time shifting. For evaluation, we used the metrics of accuracy, sensitivity, specificity, and ICBHI score. Sensitivity is defined as the recall of abnormal respiratory sounds. Specificity refers to the recall of normal respiratory sounds. The ICBHI score, calculated as the average of sensitivity and specificity, provides a balanced measure of the model’s classification performance.</p>
      </sec>
      <sec>
        <title>Implementation Details</title>
        <sec>
          <title>Technical Setup</title>
          <p>The models were implemented using PyTorch (version 1.12; Meta AI) with the CUDA Toolkit (version 11.3; NVIDIA Corporation) for graphics processing unit acceleration. Training was conducted on an NVIDIA A100 graphics processing unit with 80 GB of memory. For clarity and reproducibility, the detailed implementation and computational setup is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        </sec>
        <sec>
          <title>Preprocessing</title>
          <p>We first resampled all recordings to 16 kHz. Next, each respiratory cycle was partitioned into 10-second audio segments before proceeding with feature extraction. In cases in which cycles were shorter in duration, we replicated and concatenated them to form 10-second clips in the ICBHI dataset. As the recordings in the FABS dataset are initially labeled per recording, there was no requirement for a segmentation process. Subsequently, these audio clips were mixed with the NTUH clinical noise dataset, generating pairs of noisy and clean data for further processing.</p>
        </sec>
        <sec>
          <title>Enhancement Model Training</title>
          <p>For enhancement model training, the 10-second audio clips were divided into 4-second segments. When implementing Wave-U-Net [<xref ref-type="bibr" rid="ref46">46</xref>], the channel size was set to 24, the batch size was set to 4, and the number of layers of convolution upsampling and downsampling was set to 8. The model was trained using the Adam optimizer with a learning rate of 10<sup>−5</sup> for 40 epochs when training using pretrained weights and 10<sup>−4</sup> for 30 epochs when training from scratch. For the Multi-View Attention Network for Noise Erasure model [<xref ref-type="bibr" rid="ref49">49</xref>], the channel size was set to 60, the batch size was set to 4, and the number of layers of up and down convolution was set to 4. The model was trained using the Adam optimizer with a learning rate of 10<sup>−6</sup> for 10 epochs when training using pretrained weights and a learning rate of 10<sup>−5</sup> for 10 epochs when training from scratch. When implementing PHASEN [<xref ref-type="bibr" rid="ref32">32</xref>], which is trained in the time-frequency domain, we followed the original setup using a Hamming window of 25 ms in length and a hop size of 10 ms to generate STFT spectrograms. The number of 2-stream blocks was set to 3, the batch size was set to 4, the channel number for the amplitude stream was set to 24, and the channel number for the phase stream was set to 12. The model was trained using the Adam optimizer with a learning rate of 5 × 10<sup>–5</sup> for 20 epochs when training using pretrained weights and a learning rate of 5 × 10<sup>–4</sup> for 30 epochs when training from scratch. For CMGAN [<xref ref-type="bibr" rid="ref43">43</xref>], we followed the original setting using a Hamming window of 25 ms in length and a hop size of 6.25 ms to generate STFT spectrograms. The number of 2-stage conformer blocks was set to 4, the batch size was set to 4, and the channel number in the generator was set to 64. 
The channel numbers in the discriminator were set to 16, 32, 64, and 128. The model was trained using the Adam optimizer with a learning rate of 5 × 10<sup>–5</sup> for 20 epochs when training using pretrained weights and a learning rate of 5 × 10<sup>–4</sup> for 30 epochs when training from scratch. These hyperparameters are also listed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
          <p>The pretrained weights for these models were trained on the VoiceBank+DEMAND dataset [<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref45">45</xref>], which is commonly used in speech enhancement research.</p>
        </sec>
        <sec>
          <title>Classification Model Training</title>
          <p>For the classification model, the 4-second enhanced segments were concatenated back into 10-second audio clips. To generate the log-mel spectrogram, the waveform was transformed using STFT with a Hamming window size of 512 and a hop size of 160 samples. The STFT spectrogram was then processed through 64 mel filter banks to generate the log-mel spectrogram. In the training stage, we set the batch size to 32 and used the Adam optimizer with a learning rate of 10<sup>−4</sup> for 14,000 iterations using pretrained weights from the model trained on the 16-kHz AudioSet dataset [<xref ref-type="bibr" rid="ref51">51</xref>]. These hyperparameters are also listed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        </sec>
      </sec>
      <sec>
        <title>Evaluation Outcomes</title>
        <p>In this study, we compared the classification performance of conventional noisy data augmentation with our proposed audio-enhanced preprocessing. The test set was split into 2 groups, and each classification model was trained 10 times, yielding 20 values for statistical analysis. We conducted a 1-tailed <italic>t</italic> test to assess whether models trained on CMGAN-enhanced audio using pretrained weights showed significant improvements over other models. In addition, we reported speech quality metrics for various audio enhancement models and analyzed their correlation with classification performance.</p>
        <p>The experiment results, as shown in <xref ref-type="table" rid="table2">Table 2</xref>, highlight the effectiveness of our proposed audio enhancement preprocessing strategy for noise-robust performances. In the case of the ICBHI 2017 database, the model trained solely on clean data experienced a 33.95% drop in the ICBHI score when evaluated on the synthesized noisy dataset. Noise injection improved the score by 19.73%, but fine-tuning PHASEN achieved the highest score, outperforming noise injection by 2.28%. Regarding the FABS database, using the classification model trained on clean recordings on the noisy recordings led to a 12.48% drop in the ICBHI score. Noise injection improved performance by 1.31%, but fine-tuning CMGAN outperformed noise injection by 2.79%. Across both datasets, the audio enhancement preprocessing method consistently improved performance compared to the noise injection augmentation technique. Furthermore, it showed improved sensitivity for all enhancement model structures, with the most significant improvement being 6.31% for the ICBHI database and 13.54% for the FABS database. This indicates that the audio enhancement preprocessing method enhanced the classification model’s ability to distinguish abnormal respiratory sounds, which is crucial for the early detection of potential illnesses in clinical use.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Comparison of classification performance on both the International Conference on Biomedical and Health Informatics (ICBHI) and Formosa Archive of Breath Sound (FABS) datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="100"/>
            <col width="120"/>
            <col width="120"/>
            <col width="70"/>
            <col width="130"/>
            <col width="70"/>
            <col width="130"/>
            <col width="70"/>
            <col width="90"/>
            <col width="70"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Method</td>
                <td>Enhancement model</td>
                <td>Accuracy, mean (SD)</td>
                <td><italic>P</italic> value</td>
                <td>Sensitivity, mean (SD)</td>
                <td><italic>P</italic> value</td>
                <td>Specificity, mean (SD)</td>
                <td><italic>P</italic> value</td>
                <td>ICBHI score, mean (SD)</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="11">
                  <bold>ICBHI</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clean</td>
                <td>—<sup>a</sup></td>
                <td>79.90 (0.01)</td>
                <td>&gt;.99</td>
                <td>71.43 (0.02)</td>
                <td>&gt;.99</td>
                <td>87.27 (0.01)</td>
                <td>&gt;.99</td>
                <td>79.35 (0.01)</td>
                <td>&gt;.99</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noisy</td>
                <td>—</td>
                <td>45.70 (0.03)</td>
                <td>&lt;.001</td>
                <td>40.99 (0.04)</td>
                <td>&lt;.001</td>
                <td>49.80 (0.08)</td>
                <td>&lt;.001</td>
                <td>45.40 (0.03)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noise injection</td>
                <td>—</td>
                <td>65.85 (0.01)</td>
                <td>&lt;.001</td>
                <td>54.89 (0.04)</td>
                <td>&lt;.001</td>
                <td>75.37 (0.04)</td>
                <td>.98</td>
                <td>65.13 (0.01)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE<sup>b</sup></td>
                <td>Wave-U-Net</td>
                <td>60.86 (0.02)</td>
                <td>&lt;.001</td>
                <td>55.35 (0.04)</td>
                <td>&lt;.001</td>
                <td>65.66 (0.05)</td>
                <td>&lt;.001</td>
                <td>60.50 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>Wave-U-Net<sup>c</sup></td>
                <td>61.29 (0.02)</td>
                <td>&lt;.001</td>
                <td>55.04 (0.02)</td>
                <td>&lt;.001</td>
                <td>66.72 (0.04)</td>
                <td>&lt;.001</td>
                <td>60.88 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>PHASEN<sup>d</sup></td>
                <td>66.81 (0.01)</td>
                <td>.02</td>
                <td>57.61 (0.03)</td>
                <td>.001</td>
                <td>74.81 (0.04)</td>
                <td>.91</td>
                <td>66.21 (0.01)</td>
                <td>.005</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>PHASEN<sup>c</sup></td>
                <td>68.09<sup>e</sup> (0.01)</td>
                <td>.84</td>
                <td>57.71<sup>f</sup> (0.03)</td>
                <td>.004</td>
                <td>77.12<sup>f</sup> (0.04)</td>
                <td>&gt;.99</td>
                <td>67.41<sup>e</sup> (0.01)</td>
                <td>.64</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>MANNER<sup>g</sup></td>
                <td>67.62 (0.01)</td>
                <td>.39</td>
                <td>53.09 (0.03)</td>
                <td>&lt;.001</td>
                <td>80.26<sup>e</sup> (0.04)</td>
                <td>&gt;.99</td>
                <td>66.67 (0.01)</td>
                <td>.03</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>MANNER<sup>c</sup></td>
                <td>60.36 (0.02)</td>
                <td>&lt;.001</td>
                <td>57.67 (0.02)</td>
                <td>&lt;.001</td>
                <td>62.70 (0.04)</td>
                <td>&lt;.001</td>
                <td>60.19 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>CMGAN<sup>h</sup></td>
                <td>64.75 (0.01)</td>
                <td>&lt;.001</td>
                <td>55.84 (0.03)</td>
                <td>&lt;.001</td>
                <td>72.50 (0.02)</td>
                <td>.17</td>
                <td>64.17 (0.01)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>CMGAN<sup>c</sup></td>
                <td>67.70<sup>f</sup> (0.01)</td>
                <td>—</td>
                <td>61.20<sup>e</sup> (0.03)</td>
                <td>—</td>
                <td>73.35 (0.02)</td>
                <td>—</td>
                <td>67.28<sup>f</sup> (0.01)</td>
                <td>—</td>
              </tr>
              <tr valign="top">
                <td colspan="11">
                  <bold>FABS</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clean</td>
                <td>—</td>
                <td>85.02 (0.01)</td>
                <td>&gt;.99</td>
                <td>62.07 (0.04)</td>
                <td>&gt;.99</td>
                <td>90.01 (0.02)</td>
                <td>&lt;.001</td>
                <td>76.04 (0.02)</td>
                <td>&gt;.99</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noisy</td>
                <td>—</td>
                <td>81.02 (0.02)</td>
                <td>&lt;.001</td>
                <td>36.41 (0.04)</td>
                <td>&lt;.001</td>
                <td>90.71 (0.02)</td>
                <td>.004</td>
                <td>63.56 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noise injection</td>
                <td>—</td>
                <td>84.53 (0.01)</td>
                <td>&gt;.99</td>
                <td>34.29 (0.05)</td>
                <td>&lt;.001</td>
                <td>95.44 (0.01)</td>
                <td>&gt;.99</td>
                <td>64.87 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>Wave-U-Net</td>
                <td>85.97<sup>e</sup> (0.01)</td>
                <td>&gt;.99</td>
                <td>36.74 (0.03)</td>
                <td>&lt;.001</td>
                <td>96.66<sup>f</sup> (0.01)</td>
                <td>&gt;.99</td>
                <td>66.70 (0.01)</td>
                <td>.04</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>Wave-U-Net<sup>c</sup></td>
                <td>85.88<sup>f</sup> (0.01)</td>
                <td>&gt;.99</td>
                <td>29.08 (0.05)</td>
                <td>&lt;.001</td>
                <td>98.22<sup>e</sup> (0.01)</td>
                <td>&gt;.99</td>
                <td>63.65 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>PHASEN</td>
                <td>85.29 (0.004)</td>
                <td>&gt;.99</td>
                <td>33.64 (0.02)</td>
                <td>&lt;.001</td>
                <td>96.51 (0.01)</td>
                <td>&gt;.99</td>
                <td>65.07 (0.01)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>PHASEN<sup>c</sup></td>
                <td>85.33 (0.01)</td>
                <td>&gt;.99</td>
                <td>35.82 (0.03)</td>
                <td>&lt;.001</td>
                <td>96.09 (0.01)</td>
                <td>&gt;.99</td>
                <td>65.95 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>MANNER</td>
                <td>83.01 (0.01)</td>
                <td>.05</td>
                <td>37.50 (0.08)</td>
                <td>.01</td>
                <td>92.89 (0.03)</td>
                <td>.67</td>
                <td>65.20 (0.03)</td>
                <td>.004</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>MANNER<sup>c</sup></td>
                <td>79.00 (0.03)</td>
                <td>&lt;.001</td>
                <td>47.83<sup>e</sup> (0.06)</td>
                <td>&gt;.99</td>
                <td>85.77 (0.05)</td>
                <td>&lt;.001</td>
                <td>66.80<sup>f</sup> (0.02)</td>
                <td>.08</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>CMGAN</td>
                <td>82.47 (0.01)</td>
                <td>&lt;.001</td>
                <td>37.61 (0.05)</td>
                <td>&lt;.001</td>
                <td>92.22 (0.01)</td>
                <td>.19</td>
                <td>64.91 (0.02)</td>
                <td>&lt;.001</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td>CMGAN<sup>c</sup></td>
                <td>83.67 (0.01)</td>
                <td>—</td>
                <td>42.77<sup>f</sup> (0.03)</td>
                <td>—</td>
                <td>92.55 (0.01)</td>
                <td>—</td>
                <td>67.66<sup>e</sup> (0.01)</td>
                <td>—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Without any audio enhancement module.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>AE: audio enhancement.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>The model is fine-tuned from the pretrained weight.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>PHASEN: Phase-and-Harmonics–Aware Speech Enhancement Network.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>Best performance across all methods for this metric.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>Second-best performance across all methods for this metric.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>MANNER: Multi-View Attention Network for Noise Erasure.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>CMGAN: convolution-augmented transformer–based metric generative adversarial network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Comparing the 2 types of enhancement approaches, the time-frequency domain models (PHASEN and CMGAN) exhibited better performance in terms of ICBHI scores. In addition, CMGAN consistently showed high sensitivity across both datasets, indicating its potential for preserving respiratory sound features during audio enhancement. The spectrogram of the audio enhanced using CMGAN also revealed that it preserves more high-frequency information across all respiratory sound classes, as illustrated in <xref rid="figure2" ref-type="fig">Figure 2</xref>. In contrast, audio enhanced using other models either lost high-frequency information or retained too much noise, leading to misclassification as normal, resulting in higher specificity for those models. Moreover, we observed that, while our focus was on training a respiratory sound enhancement model, using pretrained weights from models trained on the VoiceBank+DEMAND dataset, which were originally designed for speech, still significantly improved classification performance in most cases. This highlights the cross-domain effectiveness of pretrained weights from the speech domain in respiratory sound tasks.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The log-mel spectrograms of 4 different types of respiratory sounds on the International Conference in Biomedical and Health Informatics 2017 database. Each subfigure contains clean audio, noisy audio, and 4 types of enhanced audio from different audio enhancement approaches. CMGAN: convolution-augmented transformer–based metric generative adversarial network; MANNER: Multi-View Attention Network for Noise Erasure; PHASEN: Phase-and-Harmonics–Aware Speech Enhancement Network.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e67239_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>To evaluate whether speech quality metrics, originally designed for speech, are effective for respiratory sounds, we analyzed their correlation with the ICBHI score and sensitivity. As shown in <xref ref-type="table" rid="table3">Table 3</xref>, the mean opinion score (MOS) of background noise intrusiveness (CBAK) and segmental SNR (SSNR) exhibited relatively higher correlations than other metrics, such as PESQ, STOI, the MOS of signal distortion, and the MOS of overall quality. Unlike these other metrics, which are primarily designed to assess speech intelligibility and quality, CBAK and SSNR focus on background noise intrusiveness and the SNR between recordings. This distinction explains why CBAK and SSNR show stronger correlations with classification performance, highlighting their potential applicability for respiratory sound analysis.</p>
        <p>We evaluated the inference times of 4 audio enhancement models. Wave-U-Net generates 1 second of enhanced audio in just 1.5 ms, PHASEN does so in 3.9 ms, and MANNER does so in 11.7 ms. In contrast, CMGAN processes 1 second of audio in 26 ms—a longer time that is offset by its superior classification performance. </p>
        <p>To further analyze the effectiveness of our proposed audio enhancement preprocessing method in handling different types of noise, we compared its performance using the noise injection method across various SNR levels. On the basis of the consistently outstanding performance of CMGAN across both datasets, we selected it for further analysis.</p>
        <p>On the ICBHI database, as illustrated in <xref rid="figure3" ref-type="fig">Figure 3</xref>, the noise injection method performed better with environmental noises at SNR values of 2.5 and 12.5 dB. However, the front-end audio enhancement consistently performed better for patient and friction noises across almost all noise levels.</p>
        <p>Regarding the FABS dataset, as shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>, the noise injection method performed better with environmental and friction noises at an SNR value of 17.5 dB and patient noises at SNR values of 2.5 and 7.5 dB. In all other situations, the audio enhancement preprocessing method demonstrated superior ICBHI scores.</p>
        <p>These results suggest that our proposed strategy effectively mitigates the effects of various noise types while maintaining strong classification performance. This highlights the robustness and reliability of our approach in handling diverse noise scenarios and intensities, showcasing its potential for practical applications in clinical settings.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Comparison of audio enhancement (AE) performance on both the International Conference in Biomedical and Health Informatics (ICBHI) and Formosa Archive of Breath Sound (FABS) datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="23"/>
            <col width="82"/>
            <col width="0"/>
            <col width="156"/>
            <col width="0"/>
            <col width="133"/>
            <col width="0"/>
            <col width="102"/>
            <col width="0"/>
            <col width="98"/>
            <col width="0"/>
            <col width="105"/>
            <col width="0"/>
            <col width="109"/>
            <col width="0"/>
            <col width="98"/>
            <col width="0"/>
            <col width="94"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Method</td>
                <td colspan="2">Enhancement model</td>
                <td colspan="2">Parameters (millions)</td>
                <td colspan="2">PESQ<sup>a,b</sup></td>
                <td colspan="2">CSIG<sup>c,d</sup></td>
                <td colspan="2">CBAK<sup>e,f</sup></td>
                <td colspan="2">COVL<sup>g,h</sup></td>
                <td colspan="2">SSNR<sup>i,j</sup></td>
                <td>STOI<sup>k,l</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="18">
                  <bold>ICBHI</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noisy</td>
                <td colspan="2">—<sup>m</sup></td>
                <td colspan="2">—</td>
                <td colspan="2">0.58</td>
                <td colspan="2">2.98</td>
                <td colspan="2">2.83</td>
                <td colspan="2">2.13</td>
                <td colspan="2">14.10</td>
                <td colspan="2">0.50</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">Wave-U-Net</td>
                <td colspan="2">3.3</td>
                <td colspan="2">0.56</td>
                <td colspan="2">3.07</td>
                <td colspan="2">3.25</td>
                <td colspan="2">2.18</td>
                <td colspan="2">20.30</td>
                <td colspan="2">0.49</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">Wave-U-Net<sup>n</sup></td>
                <td colspan="2">3.3</td>
                <td colspan="2">0.57</td>
                <td colspan="2">3.10</td>
                <td colspan="2">3.25</td>
                <td colspan="2">2.20</td>
                <td colspan="2">20.20</td>
                <td colspan="2">0.50</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">PHASEN<sup>o</sup></td>
                <td colspan="2">7.7</td>
                <td colspan="2">0.57</td>
                <td colspan="2">3.07</td>
                <td colspan="2">3.34</td>
                <td colspan="2">2.19</td>
                <td colspan="2">21.41</td>
                <td colspan="2">0.52</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">PHASEN<sup>n</sup></td>
                <td colspan="2">7.7</td>
                <td colspan="2">0.56</td>
                <td colspan="2">3.04</td>
                <td colspan="2">3.32</td>
                <td colspan="2">2.17</td>
                <td colspan="2">21.26</td>
                <td colspan="2">0.51</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">MANNER<sup>p</sup></td>
                <td colspan="2">24</td>
                <td colspan="2">0.59</td>
                <td colspan="2">3.23</td>
                <td colspan="2">3.24</td>
                <td colspan="2">2.27</td>
                <td colspan="2">19.85</td>
                <td colspan="2">0.55</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">MANNER<sup>n</sup></td>
                <td colspan="2">24</td>
                <td colspan="2">0.66</td>
                <td colspan="2">3.38<sup>q</sup></td>
                <td colspan="2">3.24</td>
                <td colspan="2">2.39<sup>r</sup></td>
                <td colspan="2">19.17</td>
                <td colspan="2">0.60<sup>r</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">CMGAN<sup>s</sup></td>
                <td colspan="2">1.8</td>
                <td colspan="2">0.75<sup>q</sup></td>
                <td colspan="2">3.31<sup>r</sup></td>
                <td colspan="2">3.46<sup>r</sup></td>
                <td colspan="2">2.40<sup>q</sup></td>
                <td colspan="2">22.06<sup>r</sup></td>
                <td colspan="2">0.61<sup>q</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">CMGAN<sup>n</sup></td>
                <td colspan="2">1.8</td>
                <td colspan="2">0.74<sup>r</sup></td>
                <td colspan="2">3.29</td>
                <td colspan="2">3.47<sup>q</sup></td>
                <td colspan="2">2.38</td>
                <td colspan="2">22.31<sup>q</sup></td>
                <td colspan="2">0.61<sup>q</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="18">
                  <bold>FABS</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noisy</td>
                <td colspan="2">—</td>
                <td colspan="2">—</td>
                <td colspan="2">2.10</td>
                <td colspan="2">3.80<sup>q</sup></td>
                <td colspan="2">3.41</td>
                <td colspan="2">3.03<sup>q</sup></td>
                <td colspan="2">12.99</td>
                <td colspan="2">0.62<sup>r</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">Wave-U-Net</td>
                <td colspan="2">3.3</td>
                <td colspan="2">1.78</td>
                <td colspan="2">1.96</td>
                <td colspan="2">3.16</td>
                <td colspan="2">1.90</td>
                <td colspan="2">10.97</td>
                <td colspan="2">0.52</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">Wave-U-Net<sup>n</sup></td>
                <td colspan="2">3.3</td>
                <td colspan="2">1.75</td>
                <td colspan="2">1.89</td>
                <td colspan="2">3.13</td>
                <td colspan="2">1.86</td>
                <td colspan="2">10.74</td>
                <td colspan="2">0.50</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">PHASEN</td>
                <td colspan="2">7.7</td>
                <td colspan="2">1.93</td>
                <td colspan="2">2.34</td>
                <td colspan="2">3.26</td>
                <td colspan="2">2.19</td>
                <td colspan="2">11.54</td>
                <td colspan="2">0.58</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">PHASEN<sup>n</sup></td>
                <td colspan="2">7.7</td>
                <td colspan="2">1.84</td>
                <td colspan="2">2.11</td>
                <td colspan="2">3.20</td>
                <td colspan="2">2.03</td>
                <td colspan="2">11.27</td>
                <td colspan="2">0.57</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">MANNER</td>
                <td colspan="2">24</td>
                <td colspan="2">2.14<sup>r</sup></td>
                <td colspan="2">3.35</td>
                <td colspan="2">3.44<sup>r</sup></td>
                <td colspan="2">2.81</td>
                <td colspan="2">12.87</td>
                <td colspan="2">0.61</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">MANNER<sup>n</sup></td>
                <td colspan="2">24</td>
                <td colspan="2">2.18<sup>q</sup></td>
                <td colspan="2">3.57<sup>r</sup></td>
                <td colspan="2">3.44<sup>r</sup></td>
                <td colspan="2">2.95<sup>r</sup></td>
                <td colspan="2">12.57</td>
                <td colspan="2">0.63<sup>q</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">CMGAN</td>
                <td colspan="2">1.8</td>
                <td colspan="2">2.01</td>
                <td colspan="2">1.79</td>
                <td colspan="2">3.42</td>
                <td colspan="2">1.96</td>
                <td colspan="2">13.59<sup>r</sup></td>
                <td colspan="2">0.59</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>AE</td>
                <td colspan="2">CMGAN<sup>n</sup></td>
                <td colspan="2">1.8</td>
                <td colspan="2">2.06</td>
                <td colspan="2">1.68</td>
                <td colspan="2">3.48<sup>q</sup></td>
                <td colspan="2">1.91</td>
                <td colspan="2">13.98<sup>q</sup></td>
                <td colspan="2">0.59</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>PESQ: perceptual evaluation of speech quality.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>ICBHI: sensitivity correlation coefficient=0.36 and ICBHI score correlation coefficient=0.23; FABS: sensitivity correlation coefficient=0.72 and ICBHI score correlation coefficient=0.16.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>CSIG: mean opinion score (MOS) of signal distortion.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>ICBHI: sensitivity correlation coefficient=0.51 and ICBHI score correlation coefficient=0.40; FABS: sensitivity correlation coefficient=0.34 and ICBHI score correlation coefficient=–0.25.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>CBAK: MOS of background noise intrusiveness.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>ICBHI: sensitivity correlation coefficient=0.92 and ICBHI score correlation coefficient=0.90; FABS: sensitivity correlation coefficient=0.71 and ICBHI score correlation coefficient=0.23.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>COVL: MOS of overall quality.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>ICBHI: sensitivity correlation coefficient=0.52 and ICBHI score correlation coefficient=0.39; FABS: sensitivity correlation coefficient=0.42 and ICBHI score correlation coefficient=–0.20.</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>SSNR: segmental signal-to-noise ratio.</p>
            </fn>
            <fn id="table3fn10">
              <p><sup>j</sup>ICBHI: sensitivity correlation coefficient=0.92 and ICBHI score correlation coefficient=0.93; FABS: sensitivity correlation coefficient=0.59 and ICBHI score correlation coefficient=0.22.</p>
            </fn>
            <fn id="table3fn11">
              <p><sup>k</sup>STOI: short-time objective intelligibility.</p>
            </fn>
            <fn id="table3fn12">
              <p><sup>l</sup>ICBHI: sensitivity correlation coefficient=0.45 and ICBHI score correlation coefficient=0.36; FABS: sensitivity correlation coefficient=0.68 and ICBHI score correlation coefficient=0.13.</p>
            </fn>
            <fn id="table3fn13">
              <p><sup>m</sup>Without any audio enhancement module.</p>
            </fn>
            <fn id="table3fn14">
              <p><sup>n</sup>The model is fine-tuned from the pretrained weight.</p>
            </fn>
            <fn id="table3fn15">
              <p><sup>o</sup>PHASEN: Phase-and-Harmonics–Aware Speech Enhancement Network.</p>
            </fn>
            <fn id="table3fn16">
              <p><sup>p</sup>MANNER: Multi-View Attention Network for Noise Erasure.</p>
            </fn>
            <fn id="table3fn17">
              <p><sup>q</sup>Best performance across all methods for this metric.</p>
            </fn>
            <fn id="table3fn18">
              <p><sup>r</sup>Second-best performance across all methods for this metric.</p>
            </fn>
            <fn id="table3fn19">
              <p><sup>s</sup>CMGAN: convolution-augmented transformer–based metric generative adversarial network.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Performance comparison of different approaches for each noise type with various signal-to-noise ratio (SNR) values on the International Conference in Biomedical and Health Informatics (ICBHI) 2017 database.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e67239_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Performance comparison of different approaches for each noise type with various signal-to-noise ratio (SNR) values on the Formosa Archive of Breath Sound database. ICBHI: International Conference in Biomedical and Health Informatics.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e67239_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Physician Validation Study</title>
        <p>To assess the practical utility of our proposed approach in clinical settings, we conducted a physician validation study using the ICBHI dataset. This study involved comparing the annotation results provided by 7 senior physicians under 3 different conditions: clean, noisy, and enhanced recordings. By evaluating physician assessments across these conditions, we aimed to determine the effectiveness of our enhancement approach in improving diagnostic accuracy and confidence.</p>
        <p>As shown in <xref ref-type="table" rid="table4">Table 4</xref>, the presence of noise in the recordings had a noticeable impact on the physicians’ ability to make reliable judgments, reducing accuracy by 1.81% and sensitivity by 6.46% compared to the clean recordings. However, the recordings with audio enhancement exhibited notable improvement, with a 3.92% increase in accuracy and an 11.61% increase in sensitivity compared to the noisy recordings. The enhanced audio successfully preserved sound characteristics crucial for physicians in classifying respiratory sounds, leading to higher true positive rates in distinguishing adventitious sounds.</p>
        <p>The enhanced audio recordings also received higher annotation confidence scores than the noisy recordings, as indicated in <xref rid="figure5" ref-type="fig">Figure 5</xref> and <xref ref-type="table" rid="table4">Table 4</xref>. Moreover, the speech quality metrics PESQ, MOS of signal distortion, CBAK, MOS of overall quality, SSNR, and STOI positively correlated with the physicians’ annotation confidence, as shown in <xref rid="figure6" ref-type="fig">Figure 6</xref>. These results underscore the potential of audio enhancement preprocessing techniques for practical application in real-world clinical settings.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Annotation results from physicians on different types of recordings.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="150"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="200"/>
            <thead>
              <tr valign="bottom">
                <td>Type of recording</td>
                <td>Accuracy (%)</td>
                <td>Sensitivity (%)</td>
                <td>Specificity (%)</td>
                <td>ICBHI<sup>a</sup> score (%)</td>
                <td>Confidence mean (SD)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Clean</td>
                <td>49.4</td>
                <td>23.23</td>
                <td>72.32</td>
                <td>47.77</td>
                <td>2.88 (1.50)</td>
              </tr>
              <tr valign="top">
                <td>Noisy</td>
                <td>47.59</td>
                <td>16.77</td>
                <td>74.58</td>
                <td>45.68</td>
                <td>2.32 (1.29)</td>
              </tr>
              <tr valign="top">
                <td>Enhanced</td>
                <td>51.51</td>
                <td>28.38</td>
                <td>71.75</td>
                <td>50.07</td>
                <td>2.65 (1.36)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>ICBHI: International Conference in Biomedical and Health Informatics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Physicians’ annotation confidence score comparison among clean, noisy, and enhanced recordings.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e67239_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Relationship between physicians’ annotation confidence score and speech quality metrics. CBAK: mean opinion score (MOS) of background noise intrusiveness; COVL: MOS of overall quality; CSIG: MOS of signal distortion; PESQ: perceptual evaluation of speech quality; SSNR: segmental signal-to-noise ratio; STOI: short-time objective intelligibility.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e67239_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ablation Study</title>
        <sec>
          <title>Other Classification Model</title>
          <p>To assess the effectiveness of our proposed speech enhancement preprocessing technique with different classification models, we conducted an ablation study. The hyperparameters used in this study are detailed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. We used the fine-tuned CMGAN as the speech enhancement module as it showed consistently outstanding performance in previous experiments, as shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
          <p>For the ICBHI dataset, the speech enhancement preprocessing technique increased the sensitivity by 11.71% and the ICBHI score by 1.4% when using the AST model [<xref ref-type="bibr" rid="ref54">54</xref>]. Similarly, when using the AST model with the Patch-Mix strategy [<xref ref-type="bibr" rid="ref16">16</xref>], the speech enhancement preprocessing technique increased the sensitivity by 17.08% and the ICBHI score by 1.6%, as shown in <xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref>.</p>
          <p>Regarding the FABS dataset, the speech enhancement preprocessing technique increased the sensitivity by 18.48% and the ICBHI score by 5.46% when fine-tuning the AST model [<xref ref-type="bibr" rid="ref54">54</xref>]. When fine-tuning the AST model using the Patch-Mix strategy [<xref ref-type="bibr" rid="ref16">16</xref>], the speech enhancement preprocessing technique increased the sensitivity by 13.04% and the ICBHI score by 0.68%, as shown in <xref ref-type="table" rid="table7">Tables 7</xref> and <xref ref-type="table" rid="table8">8</xref>.</p>
          <p>These results demonstrate that the speech enhancement preprocessing technique effectively improves the performance of various respiratory sound classification models, including fine-tuning the AST and AST using the Patch-Mix strategy, on both the ICBHI and FABS datasets.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Comparison of the classification performance on the International Conference in Biomedical and Health Informatics (ICBHI) database by fine-tuning the Audio Spectrogram Transformer [<xref ref-type="bibr" rid="ref54">54</xref>].</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="170"/>
              <col width="190"/>
              <col width="190"/>
              <col width="200"/>
              <thead>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Accuracy (%)</td>
                  <td>Sensitivity (%)</td>
                  <td>Specificity (%)</td>
                  <td>ICBHI score (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Clean</td>
                  <td>70.65</td>
                  <td>64.88</td>
                  <td>75.67</td>
                  <td>70.27</td>
                </tr>
                <tr valign="top">
                  <td>Noisy</td>
                  <td>24.13</td>
                  <td>30.41</td>
                  <td>18.67</td>
                  <td>24.54</td>
                </tr>
                <tr valign="top">
                  <td>Noise injection</td>
                  <td>53.78</td>
                  <td>35.28</td>
                  <td>69.87</td>
                  <td>52.58</td>
                </tr>
                <tr valign="top">
                  <td>Audio enhancement</td>
                  <td>54.46</td>
                  <td>46.99</td>
                  <td>60.96</td>
                  <td>53.98</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <table-wrap position="float" id="table6">
            <label>Table 6</label>
            <caption>
              <p>Comparison of the classification performance on the International Conference in Biomedical and Health Informatics (ICBHI) database using the Patch-Mix training strategy from the Audio Spectrogram Transformer pretrained weight [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="170"/>
              <col width="190"/>
              <col width="190"/>
              <col width="200"/>
              <thead>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Accuracy (%)</td>
                  <td>Sensitivity (%)</td>
                  <td>Specificity (%)</td>
                  <td>ICBHI score (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Clean</td>
                  <td>70.73</td>
                  <td>61.79</td>
                  <td>78.5</td>
                  <td>70.14</td>
                </tr>
                <tr valign="top">
                  <td>Noisy</td>
                  <td>29.05</td>
                  <td>35.45</td>
                  <td>23.48</td>
                  <td>29.46</td>
                </tr>
                <tr valign="top">
                  <td>Noise injection</td>
                  <td>58.02</td>
                  <td>23.9</td>
                  <td>87.69</td>
                  <td>55.8</td>
                </tr>
                <tr valign="top">
                  <td>Audio enhancement</td>
                  <td>58.55</td>
                  <td>40.98</td>
                  <td>73.83</td>
                  <td>57.4</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <table-wrap position="float" id="table7">
            <label>Table 7</label>
            <caption>
              <p>Comparison of the classification performance on the Formosa Archive of Breath Sound database by fine-tuning the Audio Spectrogram Transformer [<xref ref-type="bibr" rid="ref54">54</xref>].</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="170"/>
              <col width="190"/>
              <col width="190"/>
              <col width="200"/>
              <thead>
                <tr valign="bottom">
                  <td>
                    <break/>
                  </td>
                  <td>Accuracy (%)</td>
                  <td>Sensitivity (%)</td>
                  <td>Specificity (%)</td>
                  <td>ICBHI<sup>a</sup> score (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Clean</td>
                  <td>85.74</td>
                  <td>46.74</td>
                  <td>94.21</td>
                  <td>70.48</td>
                </tr>
                <tr valign="top">
                  <td>Noisy</td>
                  <td>83.03</td>
                  <td>36.96</td>
                  <td>93.03</td>
                  <td>65</td>
                </tr>
                <tr valign="top">
                  <td>Noise injection</td>
                  <td>83.8</td>
                  <td>31.52</td>
                  <td>95.16</td>
                  <td>63.34</td>
                </tr>
                <tr valign="top">
                  <td>Audio enhancement</td>
                  <td>80.89</td>
                  <td>50</td>
                  <td>87.6</td>
                  <td>68.8</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table7fn1">
                <p><sup>a</sup>ICBHI: International Conference on Biomedical and Health Informatics.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <table-wrap position="float" id="table8">
            <label>Table 8</label>
            <caption>
              <p>Comparison of the classification performance on the Formosa Archive of Breath Sound database using the Patch-Mix training strategy from the Audio Spectrogram Transformer pretrained weight [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="170"/>
              <col width="190"/>
              <col width="190"/>
              <col width="200"/>
              <thead>
                <tr valign="bottom">
                  <td>
                    <break/>
                  </td>
                  <td>Accuracy (%)</td>
                  <td>Sensitivity (%)</td>
                  <td>Specificity (%)</td>
                  <td>ICBHI<sup>a</sup> score (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Clean</td>
                  <td>86.13</td>
                  <td>42.39</td>
                  <td>95.63</td>
                  <td>69.01</td>
                </tr>
                <tr valign="top">
                  <td>Noisy</td>
                  <td>82.15</td>
                  <td>29.35</td>
                  <td>93.62</td>
                  <td>61.49</td>
                </tr>
                <tr valign="top">
                  <td>Noise injection</td>
                  <td>82.44</td>
                  <td>44.57</td>
                  <td>90.67</td>
                  <td>67.62</td>
                </tr>
                <tr valign="top">
                  <td>Audio enhancement</td>
                  <td>75.17</td>
                  <td>57.61</td>
                  <td>78.98</td>
                  <td>68.3</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table8fn1">
                <p><sup>a</sup>ICBHI: International Conference on Biomedical and Health Informatics.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Metric Discriminator</title>
          <p>Given that the metric discriminator optimizes PESQ, a metric primarily used in the speech domain for speech quality, a potential mismatch problem may arise when applied to respiratory sound tasks. To explore this issue, we conducted ablation studies on CMGAN’s discriminator, examining the conformer generator-only model, the conformer generative adversarial network without PESQ estimation discriminator (with normal discriminator), and the complete setup (with metric discriminator). As shown in <xref ref-type="table" rid="table9">Table 9</xref>, the addition of a metric discriminator improved overall accuracy, sensitivity, and ICBHI score. This outcome indicates a positive contribution of the metric discriminator on PESQ to respiratory sound classification.</p>
          <table-wrap position="float" id="table9">
            <label>Table 9</label>
            <caption>
              <p>Classification results of the convolution-augmented transformer–based metric generative adversarial network with different discriminator setups on the International Conference on Biomedical and Health Informatics (ICBHI) 2017 database.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="250"/>
              <col width="170"/>
              <col width="190"/>
              <col width="190"/>
              <col width="200"/>
              <thead>
                <tr valign="top">
                  <td>Setup</td>
                  <td>Accuracy (%)</td>
                  <td>Sensitivity (%)</td>
                  <td>Specificity (%)</td>
                  <td>ICBHI score (%)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Generator only</td>
                  <td>65.81</td>
                  <td>58.21</td>
                  <td>72.42</td>
                  <td>65.32</td>
                </tr>
                <tr valign="top">
                  <td>With normal discriminator</td>
                  <td>66.19</td>
                  <td>55.61</td>
                  <td>75.39</td>
                  <td>65.5</td>
                </tr>
                <tr valign="top">
                  <td>With metric discriminator</td>
                  <td>66.72</td>
                  <td>62.28</td>
                  <td>70.58</td>
                  <td>66.43</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This paper proposes a deep learning audio enhancement preprocessing pipeline for respiratory sound classification tasks. We also introduced a collection of clinical noise and a real-world respiratory sound database from the emergency department of the Hsin-Chu Branch at the NTUH. Our noise-robust method enhances model performance in noisy environments and provides physicians with improved audio recordings for manual assessment even under heavy noise conditions.</p>
        <p>The experimental results indicated that audio enhancement significantly improved performance across all 3 types of noise commonly encountered during auscultation. Specifically, our approach achieved a 2.15% improvement (<italic>P</italic>&lt;.001) over the conventional noise injection method on the ICBHI dataset and outperformed it by 2.79% (<italic>P</italic>&lt;.001) on the FABS dataset. Moreover, time-frequency–domain enhancement techniques demonstrated superior performance for this task. Analyzing the correlation between classification performance and speech quality metrics, we observed that CBAK and SSNR exhibited higher correlations with ICBHI scores. These metrics are strongly influenced by background noise but are unrelated to speech intelligibility, aligning with the experimental settings. In the physician validation study, enhanced recordings showed an 11.61% increase in sensitivity and a 14.22% improvement in classification confidence. A positive correlation was also observed between speech quality metrics and diagnostic confidence, highlighting the effectiveness of enhanced recordings in aiding physicians in detecting abnormal respiratory sounds. Our ablation study on various classification model structures revealed that audio enhancement preprocessing consistently improved performance. The findings showed enhanced sensitivity and higher ICBHI scores across both databases when tested with 2 state-of-the-art respiratory sound classification models. Furthermore, incorporating the metric discriminator PESQ was found to enhance downstream classification performance.</p>
        <p>These findings validate the feasibility and effectiveness of integrating deep learning–based audio enhancement techniques into respiratory sound classification systems, addressing the critical challenge of noise robustness and paving the way for the development of reliable clinical decision support tools.</p>
      </sec>
      <sec>
        <title>Limitations and Future Work</title>
        <p>Despite the encouraging findings in this study, there is a need to explore the co-optimization of front-end audio enhancement and classification models. As most audio enhancement tasks primarily focus on speech, the evaluation metrics are not highly correlated with respiratory sounds, potentially leading to inefficient optimization. Addressing this issue is crucial for achieving better performance in respiratory sound classification in future work. Furthermore, future studies should incorporate other types of noise and more complex noise mixture strategies to enable the development of a more noise-robust respiratory sound classification model for real-world clinical use. By considering a diverse range of noise scenarios, the model can be better prepared to handle the variability and challenges encountered in actual clinical settings. In addition, we have to speed up the model inference by simplifying the model to make it suitable for real-time applications. At the same time, we must ensure that enhancement quality is maintained and critical respiratory sound characteristics are preserved. In our long-term future work, we aim to deploy this model in real clinical environments by integrating it into electronic stethoscopes. To ensure the method’s generalizability, we plan to collect cross-site respiratory sound recordings from 100 patients across various clinical environments. Of these recordings, data from 80 patients will be used for training, whereas data from the remaining 20 patients will be reserved for testing as part of a validation process aligned with Food and Drug Administration requirements. This approach will help validate the model’s performance and facilitate its adoption for practical use in clinical settings.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this study, we investigated the impact of incorporating a deep learning–based audio enhancement module into automatic respiratory sound classification systems. Our results demonstrated that this approach significantly improved the system’s robustness and clinical applicability, particularly in noisy environments. The enhanced audio not only improved classification performance on the ICBHI and FABS datasets but also increased diagnostic sensitivity and confidence among physicians. This study highlights the potential of audio enhancement as a critical component in developing reliable and trustworthy clinical decision support systems for respiratory sound analysis.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Details of the technical setup used in this study.</p>
        <media xlink:href="ai_v4i1e67239_app1.docx" xlink:title="DOCX File , 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Hyperparameters for training enhancement and classification models.</p>
        <media xlink:href="ai_v4i1e67239_app2.docx" xlink:title="DOCX File , 20 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AST</term>
          <def>
            <p>Audio Spectrogram Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CBAK</term>
          <def>
            <p>mean opinion score of background noise intrusiveness</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CMGAN</term>
          <def>
            <p>convolution-augmented transformer–based metric generative adversarial network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CNN</term>
          <def>
            <p>convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CNN14</term>
          <def>
            <p>14-layer convolutional neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">Conformer</term>
          <def>
            <p>convolution-augmented transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">FABS</term>
          <def>
            <p>Formosa Archive of Breath Sound</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">ICBHI</term>
          <def>
            <p>International Conference on Biomedical and Health Informatics</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MOS</term>
          <def>
            <p>mean opinion score</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NTUH</term>
          <def>
            <p>National Taiwan University Hospital</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">PESQ</term>
          <def>
            <p>perceptual evaluation of speech quality</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">PHASEN</term>
          <def>
            <p>Phase-and-Harmonics–Aware Speech Enhancement Network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">SNR</term>
          <def>
            <p>signal-to-noise ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SSNR</term>
          <def>
            <p>segmental signal-to-noise ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">STFT</term>
          <def>
            <p>short-time Fourier transform</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">STOI</term>
          <def>
            <p>short-time objective intelligibility</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This research is funded by the National Science and Technology Council of Taiwan under grant 112-2320-B-002-044-MY3.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bohadana</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Izbicki</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Kraman</surname>
              <given-names>SS</given-names>
            </name>
          </person-group>
          <article-title>Fundamentals of lung auscultation</article-title>
          <source>N Engl J Med</source>
          <year>2014</year>
          <month>02</month>
          <day>20</day>
          <volume>370</volume>
          <issue>8</issue>
          <fpage>744</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmra1302901</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arts</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>EH</given-names>
            </name>
            <name name-style="western">
              <surname>van de Ven</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Heunks</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tuinman</surname>
              <given-names>PR</given-names>
            </name>
          </person-group>
          <article-title>The diagnostic accuracy of lung auscultation in adult patients with acute pulmonary pathologies: a meta-analysis</article-title>
          <source>Sci Rep</source>
          <year>2020</year>
          <month>04</month>
          <day>30</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>7347</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-020-64405-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-020-64405-6</pub-id>
          <pub-id pub-id-type="medline">32355210</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-020-64405-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC7192898</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kuo</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Wheezing, a significant clinical phenotype of COPD: experience from the Taiwan Obstructive Lung Disease Study</article-title>
          <source>Int J Chronic Obstr Pulm Dis</source>
          <year>2015</year>
          <month>10</month>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>2121</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.2147/copd.s92062</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Piirila</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sovijarvi</surname>
              <given-names>AR</given-names>
            </name>
          </person-group>
          <article-title>Crackles: recording, analysis and clinical significance</article-title>
          <source>Eur Respir J</source>
          <year>1995</year>
          <month>12</month>
          <day>01</day>
          <volume>8</volume>
          <issue>12</issue>
          <fpage>2139</fpage>
          <lpage>48</lpage>
          <pub-id pub-id-type="doi">10.1183/09031936.95.08122139</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chambres</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Hanna</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Desainte-Catherine</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Automatic detection of patient with respiratory diseases using lung sound analysis</article-title>
          <source>Proceedings of the International Conference on Content-Based Multimedia Indexing</source>
          <year>2018</year>
          <conf-name>CBMI 2018</conf-name>
          <conf-date>September 4-6, 2018</conf-date>
          <conf-loc>La Rochelle, France</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cbmi.2018.8516489</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hyon</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yoo</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ha</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Respiratory sound classification for crackles, wheezes, and rhonchi in the clinical field using deep learning</article-title>
          <source>Sci Rep</source>
          <year>2021</year>
          <month>08</month>
          <day>25</day>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>17186</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-021-96724-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-021-96724-7</pub-id>
          <pub-id pub-id-type="medline">34433880</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-021-96724-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC8387488</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wanasinghe</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bandara</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Madusanka</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Meedeniya</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bandara</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Díez</surname>
              <given-names>ID</given-names>
            </name>
          </person-group>
          <article-title>Lung sound classification with multi-feature integration utilizing lightweight CNN model</article-title>
          <source>IEEE Access</source>
          <year>2024</year>
          <volume>12</volume>
          <fpage>21262</fpage>
          <lpage>76</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2024.3361943</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pessoa</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Petmezas</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Papageorgiou</surname>
              <given-names>VE</given-names>
            </name>
            <name name-style="western">
              <surname>Rocha</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Stefanopoulos</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kilintzis</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Pediatric respiratory sound classification using a dual input deep learning architecture</article-title>
          <source>Proceedings of the IEEE Biomedical Circuits and Systems Conference</source>
          <year>2023</year>
          <conf-name>BioCAS 2023</conf-name>
          <conf-date>October 19-21, 2023</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <pub-id pub-id-type="doi">10.1109/biocas58349.2023.10388733</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Acharya</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Basu</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Deep neural network for respiratory sound classification in wearable devices enabled by patient specific model tuning</article-title>
          <source>IEEE Trans Biomed Circuits Syst</source>
          <year>2020</year>
          <month>06</month>
          <volume>14</volume>
          <issue>3</issue>
          <fpage>535</fpage>
          <lpage>44</lpage>
          <pub-id pub-id-type="doi">10.1109/TBCAS.2020.2981172</pub-id>
          <pub-id pub-id-type="medline">32191898</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>BW</given-names>
            </name>
          </person-group>
          <article-title>A glance-and-gaze network for respiratory sound classification</article-title>
          <source>Proceedings of the 2022 IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2022</year>
          <conf-name>ICASSP 2022</conf-name>
          <conf-date>May 23-27, 2022</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp43922.2022.9746053</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Niu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Automatic respiratory sound classification via multi-branch temporal convolutional network</article-title>
          <source>2022 IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2022</year>
          <conf-name>ICASSP 2022</conf-name>
          <conf-date>May 23-27, 2022</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp43922.2022.9746182</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Multi-view spectrogram transformer for respiratory sound classification</article-title>
          <source>Proceedings of the 2024 IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2024</year>
          <conf-name>ICASSP 2024</conf-name>
          <conf-date>April 14-19, 2024</conf-date>
          <conf-loc>Seoul, Republic of Korea</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp48485.2024.10445825</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Research on lung sound classification model based on dual-channel CNN-LSTM algorithm</article-title>
          <source>Biomed Signal Process Control</source>
          <year>2024</year>
          <month>08</month>
          <volume>94</volume>
          <fpage>106257</fpage>
          <pub-id pub-id-type="doi">10.1016/j.bspc.2024.106257</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Contrastive embedding learning method for respiratory sound classification</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2021</year>
          <conf-name>ICASSP 2021</conf-name>
          <conf-date>June 6-11, 2021</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp39728.2021.9414385</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Satija</surname>
              <given-names>U</given-names>
            </name>
          </person-group>
          <article-title>AsthmaSCELNet: a lightweight supervised contrastive embedding learning framework for asthma classification using lung sounds</article-title>
          <source>Proceedings of the 24th INTERSPEECH Conference</source>
          <year>2023</year>
          <conf-name>INTERSPEECH 2023</conf-name>
          <conf-date>August 20-24, 2023</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <pub-id pub-id-type="doi">10.21437/interspeech.2023-428</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bae</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Baek</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Son</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ha</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tae</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yun</surname>
              <given-names>SY</given-names>
            </name>
          </person-group>
          <article-title>Patch-mix contrastive learning with audio spectrogram transformer on respiratory sound classification</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 23, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2305.14032"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2023-1426</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moummad</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Farrugia</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Pretraining respiratory sound representations using metadata and contrastive learning</article-title>
          <source>Proceedings of the IEEE Workshop on Applications of Signal Processing to Audio and Acoustics</source>
          <year>2023</year>
          <conf-name>WASPAA 2023</conf-name>
          <conf-date>October 22-25, 2023</conf-date>
          <conf-loc>New Paltz, NY</conf-loc>
          <pub-id pub-id-type="doi">10.1109/waspaa58266.2023.10248130</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kochetov</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Putin</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Balashov</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Filchenkov</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shalyto</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Noise masking recurrent neural network for respiratory sound classification</article-title>
          <source>Proceedings of the 27th International Conference on Artificial Neural Networks and Machine Learning</source>
          <year>2018</year>
          <conf-name>ICANN 2018</conf-name>
          <conf-date>October 4-7, 2018</conf-date>
          <conf-loc>Rhodes, Greece</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-030-01424-7_21</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>LungRN+NL: an improved adventitious lung sound classification using non-local block ResNet neural network with mixup data augmentation</article-title>
          <source>Proceedings of the INTERSPEECH 2020</source>
          <year>2020</year>
          <conf-name>INTERSPEECH 2020</conf-name>
          <conf-date>October 25-29, 2020</conf-date>
          <conf-loc>Virtual Event, China</conf-loc>
          <pub-id pub-id-type="doi">10.21437/interspeech.2020-2487</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>A domain transfer based data augmentation method for automated respiratory classification</article-title>
          <source>Proceedings of the 2022 IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2022</year>
          <conf-name>ICASSP 2022</conf-name>
          <conf-date>May 23-27, 2022</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp43922.2022.9746941</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Pernkopf</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Lung sound classification using co-tuning and stochastic normalization</article-title>
          <source>IEEE Trans Biomed Eng</source>
          <year>2022</year>
          <month>9</month>
          <volume>69</volume>
          <issue>9</issue>
          <fpage>2872</fpage>
          <lpage>82</lpage>
          <pub-id pub-id-type="doi">10.1109/tbme.2022.3156293</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gairola</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tom</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kwatra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>RespireNet: a deep neural network for accurately detecting abnormal lung sounds in limited data setting</article-title>
          <source>Proceedings of the 43rd Annual International Conference of the IEEE Engineering in Medicine &amp; Biology Society</source>
          <year>2021</year>
          <conf-name>EMBC 2021</conf-name>
          <conf-date>November 1-5, 2021</conf-date>
          <conf-loc>Mexico City, Mexico</conf-loc>
          <pub-id pub-id-type="doi">10.1109/embc46164.2021.9630091</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Respiratory sound classification based on BiGRU-attention network with XGBoost</article-title>
          <source>Proceedings of the IEEE International Conference on Bioinformatics and Biomedicine</source>
          <year>2020</year>
          <conf-name>BIBM 2020</conf-name>
          <conf-date>December 16-19, 2020</conf-date>
          <conf-loc>Seoul, Republic of Korea</conf-loc>
          <pub-id pub-id-type="doi">10.1109/bibm49941.2020.9313506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khullar</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Casalino</surname>
              <given-names>LP</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Aneja</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Public vs physician views of liability for artificial intelligence in health care</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2021</year>
          <month>07</month>
          <day>14</day>
          <volume>28</volume>
          <issue>7</issue>
          <fpage>1574</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33871009"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocab055</pub-id>
          <pub-id pub-id-type="medline">33871009</pub-id>
          <pub-id pub-id-type="pii">6237249</pub-id>
          <pub-id pub-id-type="pmcid">PMC8279784</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Diprose</surname>
              <given-names>WK</given-names>
            </name>
            <name name-style="western">
              <surname>Buist</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Thurier</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shand</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Physician understanding, explainability, and trust in a hypothetical machine learning risk calculator</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>04</month>
          <day>01</day>
          <volume>27</volume>
          <issue>4</issue>
          <fpage>592</fpage>
          <lpage>600</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32106285"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz229</pub-id>
          <pub-id pub-id-type="medline">32106285</pub-id>
          <pub-id pub-id-type="pii">5762808</pub-id>
          <pub-id pub-id-type="pmcid">PMC7647292</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shim</surname>
              <given-names>CS</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>MH Jr</given-names>
            </name>
          </person-group>
          <article-title>Relationship of wheezing to the severity of obstruction in asthma</article-title>
          <source>Arch Intern Med</source>
          <year>1983</year>
          <month>05</month>
          <volume>143</volume>
          <issue>5</issue>
          <fpage>890</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="medline">6679232</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kinoshita</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ochiai</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Delcroix</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nakatani</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Improving noise robust automatic speech recognition with single-channel time-domain enhancement network</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2020</year>
          <conf-name>ICASSP 2020</conf-name>
          <conf-date>May 4-8, 2020</conf-date>
          <conf-loc>Barcelona, Spain</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp40776.2020.9053266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pandey</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Saraf</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Dual application of speech enhancement for automatic speech recognition</article-title>
          <source>Proceedings of the IEEE Spoken Language Technology Workshop</source>
          <year>2021</year>
          <conf-name>SLT 2021</conf-name>
          <conf-date>January 19-22, 2021</conf-date>
          <conf-loc>Shenzhen, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/slt48900.2021.9383624</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Cornell</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Masuyama</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Scheibler</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>ZQ</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Watanabe</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ESPnet-SE++: speech enhancement for robust speech recognition, translation, and understanding</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on July 19, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2207.09514"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2022-10727</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Florencio</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Speech enhancement in multiple-noise conditions using deep neural networks</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 9, 2016</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1605.02427"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2016-88</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Seltzer</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>An investigation of deep neural networks for noise robust speech recognition</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2013</year>
          <conf-name>ICASSP 2013</conf-name>
          <conf-date>May 26-31, 2013</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2013.6639100</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>PHASEN: a phase-and-harmonics-aware speech enhancement network</article-title>
          <source>Proc AAAI Conf Artif Intell</source>
          <year>2020</year>
          <volume>34</volume>
          <issue>05</issue>
          <fpage>9458</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v34i05.6489</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bagchi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Plantinga</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Stiff</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fosler-Lussier</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Spectral feature mapping with MIMIC loss for robust speech recognition</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2018</year>
          <conf-name>ICASSP 2018</conf-name>
          <conf-date>April 15-20, 2018</conf-date>
          <conf-loc>Calgary, AB</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2018.8462622</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>TW</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Kawai</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>End-to-end waveform utterance enhancement for direct evaluation metrics optimization by fully convolutional neural networks</article-title>
          <source>IEEE/ACM Trans Audio Speech Lang Process</source>
          <year>2018</year>
          <month>9</month>
          <volume>26</volume>
          <issue>9</issue>
          <fpage>1570</fpage>
          <lpage>84</lpage>
          <pub-id pub-id-type="doi">10.1109/taslp.2018.2821903</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Koizumi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Niwa</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hioka</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kobayashi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Haneda</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>DNN-based source enhancement to increase objective sound quality assessment score</article-title>
          <source>IEEE/ACM Trans Audio Speech Lang Process</source>
          <year>2018</year>
          <month>10</month>
          <volume>26</volume>
          <issue>10</issue>
          <fpage>1780</fpage>
          <lpage>92</lpage>
          <pub-id pub-id-type="doi">10.1109/taslp.2018.2842156</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>CF</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>SD</given-names>
            </name>
          </person-group>
          <article-title>MetricGAN: generative adversarial networks based black-box metric scores optimization for speech enhancement</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 13, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1905.04874"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rix</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Beerends</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Hollier</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Hekstra</surname>
              <given-names>AP</given-names>
            </name>
          </person-group>
          <article-title>Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing</source>
          <year>2001</year>
          <conf-name>ICASSP 2001</conf-name>
          <conf-date>May 07-11, 2001</conf-date>
          <conf-loc>Salt Lake City, UT</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2001.941023</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taal</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Hendriks</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Heusdens</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A short-time objective intelligibility measure for time-frequency weighted noisy speech</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2010</year>
          <conf-name>ICASSP 2010</conf-name>
          <conf-date>March 14-19, 2010</conf-date>
          <conf-loc>Dallas, TX</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2010.5495701</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hsieh</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Plantinga</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ravanelli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>MetricGAN+: an improved version of MetricGAN for speech enhancement</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 8, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.03538"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2021-599</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yoshioka</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Continuous speech separation with conformer</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2021</year>
          <conf-name>ICASSP 2021</conf-name>
          <conf-date>June 6-11, 2021</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp39728.2021.9413423</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gulati</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Conformer: convolution-augmented transformer for speech recognition</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 16, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2005.08100"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2020-3015</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zeineldeen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lüscher</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gerstenberger</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schlüter</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Conformer-based hybrid ASR system for switchboard dataset</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source>
          <year>2022</year>
          <conf-name>ICASSP 2022</conf-name>
          <conf-date>May 23-27, 2022</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp43922.2022.9746377</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Abdulatif</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>CMGAN: conformer-based metric GAN for speech enhancement</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on March 28, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2203.15149"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2022-517</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thiemann</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ito</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Vincent</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>The Diverse Environments Multi-channel Acoustic Noise Database (DEMAND): a database of multichannel environmental noise recordings</article-title>
          <source>Proc Mtgs Acoust</source>
          <year>2013</year>
          <month>5</month>
          <day>14</day>
          <volume>19</volume>
          <fpage>035081</fpage>
          <pub-id pub-id-type="doi">10.1121/1.4799597</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Valentini-Botinhao</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Noisy speech database for training speech enhancement algorithms and TTS models</article-title>
          <source>University of Edinburgh</source>
          <year>2017</year>
          <access-date>2025-02-28</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://datashare.ed.ac.uk/handle/10283/2791">https://datashare.ed.ac.uk/handle/10283/2791</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Macartney</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Weyde</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Improved speech enhancement with the Wave-U-Net</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on November 27, 2018</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1811.11307"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pandey</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Dual-path self-attention RNN for real-time speech enhancement</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on October 23, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2010.12713"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>WP</given-names>
            </name>
          </person-group>
          <article-title>TSTNN: two-stage transformer based neural network for speech enhancement in the time domain</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2021</year>
          <conf-name>ICASSP 2021</conf-name>
          <conf-date>June 06-11, 2021</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp39728.2021.9413740</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>BH</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>SW</given-names>
            </name>
          </person-group>
          <article-title>MANNER: multi-view attention network for noise erasure</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2022</year>
          <conf-name>ICASSP 2022</conf-name>
          <conf-date>May 23-27, 2022</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp43922.2022.9747120</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fei-Fei</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>ImageNet: a large-scale hierarchical image database</article-title>
          <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>
          <year>2009</year>
          <conf-name>CVPR 2009</conf-name>
          <conf-date>June 20-25, 2009</conf-date>
          <conf-loc>Miami, FL</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvprw.2009.5206848</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gemmeke</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Ellis</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Freedman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jansen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lawrence</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>RC</given-names>
            </name>
          </person-group>
          <article-title>Audio set: an ontology and human-labeled dataset for audio events</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2017</year>
          <conf-name>ICASSP 2017</conf-name>
          <conf-date>March 5-9, 2017</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2017.7952261</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Piczak</surname>
              <given-names>KJ</given-names>
            </name>
          </person-group>
          <article-title>ESC: dataset for environmental sound classification</article-title>
          <source>Proceedings of the 23rd ACM International Conference on Multimedia</source>
          <year>2015</year>
          <conf-name>MM '15</conf-name>
          <conf-date>October 26-30, 2015</conf-date>
          <conf-loc>Brisbane, Australia</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2733373.2806390</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kong</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Iqbal</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Plumbley</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>PANNs: large-scale pretrained audio neural networks for audio pattern recognition</article-title>
          <source>IEEE/ACM Trans Audio Speech Lang Process</source>
          <year>2020</year>
          <volume>28</volume>
          <fpage>2880</fpage>
          <lpage>94</lpage>
          <pub-id pub-id-type="doi">10.1109/taslp.2020.3030497</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>YA</given-names>
            </name>
            <name name-style="western">
              <surname>Glass</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>AST: audio spectrogram transformer</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 5, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.01778"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2021-698</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>CI</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>YA</given-names>
            </name>
            <name name-style="western">
              <surname>Glass</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>SSAST: self-supervised audio spectrogram transformer</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on October 19, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2110.09784"/>
          </comment>
          <pub-id pub-id-type="doi">10.1609/aaai.v36i10.21315</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Tzeng</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>HY</given-names>
            </name>
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>EP</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CC</given-names>
            </name>
          </person-group>
          <article-title>GaP-Aug: gamma patch-wise correction augmentation method for respiratory sound classification</article-title>
          <source>Proceedings of the 2024 IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2024</year>
          <conf-name>ICASSP 2024</conf-name>
          <conf-date>April 14-19, 2024</conf-date>
          <conf-loc>Seoul, Republic of Korea</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp48485.2024.10447967</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Toikkanen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bae</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>HY</given-names>
            </name>
          </person-group>
          <article-title>RepAugment: input-agnostic representation-level augmentation for respiratory sound classification</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 5, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2405.02996"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/embc53108.2024.10782363</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Emmanouilidou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>McCollum</surname>
              <given-names>ED</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Elhilali</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Computerized lung sound screening for pediatric auscultation in noisy field environments</article-title>
          <source>IEEE Trans Biomed Eng</source>
          <year>2018</year>
          <month>7</month>
          <volume>65</volume>
          <issue>7</issue>
          <fpage>1564</fpage>
          <lpage>74</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28641244"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/TBME.2017.2717280</pub-id>
          <pub-id pub-id-type="medline">28641244</pub-id>
          <pub-id pub-id-type="pmcid">PMC5984191</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rocha</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Filos</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mendes</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Vogiatzis</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Perantoni</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kaimakamis</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Natsiavas</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Oliveira</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jácome</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Marques</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Paiva</surname>
              <given-names>RP</given-names>
            </name>
            <name name-style="western">
              <surname>Chouvarda</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Carvalho</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Maglaveras</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>A respiratory sound database for the development of automated classification</article-title>
          <source>Proceedings of the International Conference on Biomedical and Health Informatics</source>
          <year>2018</year>
          <conf-name>ICBHI 2017</conf-name>
          <conf-date>November 18-21, 2017</conf-date>
          <conf-loc>Thessaloniki, Greece</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-981-10-7419-6_6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cisse</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dauphin</surname>
              <given-names>YN</given-names>
            </name>
            <name name-style="western">
              <surname>Lopez-Paz</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>mixup: beyond empirical risk minimization</article-title>
          <source>Proceedings of the International Conference on Learning Representations</source>
          <year>2018</year>
          <conf-name>ICLR 2018</conf-name>
          <conf-date>April 30-May 3, 2018</conf-date>
          <conf-loc>Vancouver, BC, Canada</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-981-19-9711-2_6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Park</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chiu</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Zoph</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cubuk</surname>
              <given-names>ED</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>QV</given-names>
            </name>
          </person-group>
          <article-title>SpecAugment: a simple data augmentation method for automatic speech recognition</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 18, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1904.08779"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2019-2680</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Triplet loss in Siamese network for object tracking</article-title>
          <source>Proceedings of the 15th European Conference on Computer Vision</source>
          <year>2018</year>
          <conf-name>ECCV 2018</conf-name>
          <conf-date>September 8-14, 2018</conf-date>
          <conf-loc>Munich, Germany</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-030-01261-8_28</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
