<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v2i1e46769</article-id>
      <article-id pub-id-type="pmid">38090533</article-id>
      <article-id pub-id-type="doi">10.2196/46769</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Self-Supervised Electroencephalogram Representation Learning for Automatic Sleep Staging: Model Development and Evaluation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Malin</surname>
            <given-names>Bradley</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wen</surname>
            <given-names>Jun</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mungoli</surname>
            <given-names>Neelesh</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Chaoqi</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5017-6114</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Xiao</surname>
            <given-names>Cao</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3869-6942</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Westover</surname>
            <given-names>M Brandon</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4803-312X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Sun</surname>
            <given-names>Jimeng</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Computer Science Department</institution>
            <institution>Carle Illinois College of Medicine</institution>
            <institution>University of Illinois, Urbana-Champaign</institution>
            <addr-line>201 N Goodwin Ave</addr-line>
            <addr-line>Urbana, IL, 61801</addr-line>
            <country>United States</country>
            <phone>1 9142698058</phone>
            <email>jimeng.sun@gmail.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1512-6426</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Computer Science Department</institution>
        <institution>Carle Illinois College of Medicine</institution>
        <institution>University of Illinois, Urbana-Champaign</institution>
        <addr-line>Urbana, IL</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Relativity Inc</institution>
        <addr-line>Chicago, IL</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Harvard Medical School</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Jimeng Sun <email>jimeng.sun@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>26</day>
        <month>7</month>
        <year>2023</year>
      </pub-date>
      <volume>2</volume>
      <elocation-id>e46769</elocation-id>
      <history>
        <date date-type="received">
          <day>24</day>
          <month>2</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>14</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>27</day>
          <month>5</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>2</day>
          <month>6</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Chaoqi Yang, Cao Xiao, M Brandon Westover, Jimeng Sun. Originally published in JMIR AI (https://ai.jmir.org), 26.07.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2023/1/e46769" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Deep learning models have shown great success in automating tasks in sleep medicine by learning from carefully annotated electroencephalogram (EEG) data. However, effectively using a large amount of raw EEG data remains a challenge.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this study, we aim to learn robust vector representations from massive unlabeled EEG signals, such that the learned vectorized features (1) are expressive enough to replace the raw signals in the sleep staging task, and (2) provide better predictive performance than supervised models in scenarios involving fewer labels and noisy samples.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We propose a self-supervised model, Contrast with the World Representation (ContraWR), for EEG signal representation learning. Unlike previous models that use a set of negative samples, our model uses global statistics (ie, the average representation) from the data set to distinguish signals associated with different sleep stages. The ContraWR model is evaluated on 3 real-world EEG data sets that include both settings: at-home and in-laboratory EEG recording.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>ContraWR outperforms 4 recently reported self-supervised learning methods on the sleep staging task across 3 large EEG data sets. ContraWR also supersedes supervised learning when fewer training labels are available (eg, 4% accuracy improvement when less than 2% of data are labeled on the Sleep EDF data set). Moreover, the model provides informative, representative feature structures in 2D projection.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>We show that ContraWR is robust to noise and can provide high-quality EEG representations for downstream prediction tasks. The proposed model can be generalized to other unsupervised physiological signal learning tasks. Future directions include exploring task-specific data augmentations and combining self-supervised methods with supervised methods, building upon the initial success of self-supervised learning reported in this study.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>physiological signals</kwd>
        <kwd>electroencephalogram</kwd>
        <kwd>EEG</kwd>
        <kwd>sleep staging</kwd>
        <kwd>sleep</kwd>
        <kwd>predict</kwd>
        <kwd>wearable devices</kwd>
        <kwd>wearable</kwd>
        <kwd>self-supervised learning</kwd>
        <kwd>digital health</kwd>
        <kwd>mHealth</kwd>
        <kwd>mobile health</kwd>
        <kwd>healthcare</kwd>
        <kwd>health care</kwd>
        <kwd>machine learning</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Deep learning models have shown great success in automating tasks in sleep medicine by learning from high-quality labeled electroencephalogram (EEG) data [<xref ref-type="bibr" rid="ref1">1</xref>]. EEG data are collected from patients wearing clinical sensors, which generate real-time multimodal signal data. A common challenge in classifying physiological signals, including EEG signals, is the lack of enough high-quality labels. This paper introduces a novel self-supervised model that leverages the inherent structure within large, unlabeled, and noisy data sets and produces robust feature representations. These representations can significantly enhance the performance of downstream classification tasks, such as sleep staging, especially in cases where only limited labeled data are available.</p>
      <p>Self-supervised learning (specifically, self-supervised contrastive learning) aims at learning a feature encoder that maps input signals into a vector representation using unlabeled data. Self-supervised methods involve two steps: (1) a <italic>pretrain</italic> step to learn the feature encoder without labels and (2) a <italic>supervised</italic> step to evaluate the learned encoder with a small amount of labeled data. During the pretrain step, some recent methods (eg, Momentum Contrast [MoCo] [<xref ref-type="bibr" rid="ref2">2</xref>] and the simple framework for contrastive learning of visual representations [SimCLR] [<xref ref-type="bibr" rid="ref3">3</xref>]) use the feature encoder to construct positive and negative pairs from the unlabeled data and then optimize the encoder by pushing positive pairs closer and negative pairs farther away. A positive pair consists of 2 different augmented versions of the same sample (ie, applying 2 data augmentation methods separately to the same sample), while a negative pair is generated from the augmented data of 2 different samples. For example, the augmentation method for EEG data can be denoising or channel flipping. In this practice, existing negative sampling strategies often incur sampling issues [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], especially for noisy EEG data, which significantly affects performance [<xref ref-type="bibr" rid="ref6">6</xref>]. Specifically, in the self-supervised learning setting (without labels), the negative samples are actually random samples, which may be from the same latent class. Using these “negative samples” can potentially undermine model performance.</p>
      <p>Technically, this study contributes to the pretrain step, where we address the aforementioned limitations of existing negative sampling strategies (eg, MoCo [<xref ref-type="bibr" rid="ref2">2</xref>] and SimCLR [<xref ref-type="bibr" rid="ref3">3</xref>]) by leveraging global data statistics. In contrastive learning, positive pairs provide similarity-related information, while negative pairs provide contrastive information. Both types of information are essential in learning an effective feature encoder. This study proposes a new contrastive learning method, named Contrast with the World Representation (ContraWR). In our ContraWR, we construct positive pairs using data augmentation, similar to existing methods, while we use one global average representation over the data set (called the <italic>world representation</italic>) as the negative sample to provide the contrastive information. Derived from global data statistics, the world representation is robust even in noisy environments, and it follows a new contrastive guidance in the absence of labels: <italic>the representation similarity between positive pairs is stronger than the similarity to the world representation</italic>. Moreover, in this study, we later strengthen our model with an instance-aware world representation for individual samples, where closer samples have larger weights in calculating the global average. Our experiments show that the instance-aware world representation makes the model more accurate, and this conclusion aligns with the findings from a previous paper [<xref ref-type="bibr" rid="ref6">6</xref>] that harder negative samples are more effective in learning feature encoding.</p>
      <p>We evaluated the proposed ContraWR on the sleep staging task with 3 real-world EEG data sets. Our model achieved results comparable to or better than those of recent popular self-supervised methods including MoCo [<xref ref-type="bibr" rid="ref2">2</xref>], SimCLR [<xref ref-type="bibr" rid="ref3">3</xref>], Bootstrap Your Own Latent (BYOL) [<xref ref-type="bibr" rid="ref7">7</xref>], and simple Siamese (SimSiam) [<xref ref-type="bibr" rid="ref8">8</xref>]. The results also show that self-supervised contrastive methods, especially our ContraWR method, are much more powerful in low-label scenarios than supervised learning (eg, 4% accuracy improvement on sleep staging with less than 2% training data of the Sleep EDF data set).</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>EEG Data Sets</title>
        <p>We considered 3 real-world EEG data sets for this study (the first 2 data sets entirely comprise at-home PSG recordings):</p>
        <list list-type="order">
          <list-item>
            <p>The data set of the Sleep Heart Health Study (SHHS) [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>] is a multicenter cohort study from the National Heart, Lung, and Blood Institute (Bethesda, Maryland), assembled to study sleep-disordered breathing, which comprises 5804 adult patients older than 40 years and 5445 recordings in the first visit. We used first-visit polysomnography (PSG) data in the experiments. Each recording has 14 PSG channels, and the recording frequency is 125.0 Hz. We used the C3/A2 and C4/A1 EEG channels.</p>
          </list-item>
          <list-item>
            <p>The Sleep EDF [<xref ref-type="bibr" rid="ref11">11</xref>] cassette portion is another benchmark data set collected in a 1987-1991 study of age effects on sleep in healthy Caucasians. The data comprise 78 subjects aged 25-101 years who were taking non–sleep-related medications; the data set contains 153 full-night EEG recordings with a recording frequency of 100.0 Hz. We extracted the Fpz-Cz/Pz-Oz EEG channels as the raw inputs to the model.</p>
          </list-item>
          <list-item>
            <p>The Massachusetts General Hospital’s (MGH’s) MGH Sleep data set [<xref ref-type="bibr" rid="ref1">1</xref>] was collected from MGH’s sleep laboratory, which comprises more than 5000 individuals, where 6 EEG channels (ie, F3-M2, F4-M1, C3-M2, C4-M1, O1-M2, and O2-M1) were used for sleep staging, recorded at a 200.0-Hz frequency. After filtering out mismatched signals and missing labels, we finally curated 6478 recordings.</p>
          </list-item>
        </list>
        <p>The data set’s statistics can be found in <xref ref-type="table" rid="table1">Table 1</xref>, and the class label distribution is shown in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Data set statistics.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Name</td>
                <td>Location</td>
                <td>Channels, n</td>
                <td>Recordings, n</td>
                <td>Epochs, n</td>
                <td>Storage (GB)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Sleep Heart Health Study</td>
                <td>At home</td>
                <td>2</td>
                <td>5445</td>
                <td>4,535,949</td>
                <td>260</td>
              </tr>
              <tr valign="top">
                <td>Sleep EDF</td>
                <td>At home</td>
                <td>2</td>
                <td>153</td>
                <td>415,089</td>
                <td>20</td>
              </tr>
              <tr valign="top">
                <td>MGH<sup>a</sup> Sleep</td>
                <td>In the laboratory</td>
                <td>6</td>
                <td>6478</td>
                <td>4,863,523</td>
                <td>1322</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>MGH: Massachusetts General Hospital.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Class label distribution of the data sets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="170"/>
            <col width="170"/>
            <col width="160"/>
            <col width="170"/>
            <col width="170"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Name</td>
                <td colspan="5">Epochs, n (%)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>W</td>
                <td>N1</td>
                <td>N2</td>
                <td>N3</td>
                <td>R</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Sleep Heart Health Study</td>
                <td>1,306,742 (28.8)</td>
                <td>169,021 (3.7)</td>
                <td>1,856,130 (40.9)</td>
                <td>571,191 (12.6)</td>
                <td>632,865 (14.0)</td>
              </tr>
              <tr valign="top">
                <td>Sleep EDF</td>
                <td>285,561 (68.8)</td>
                <td>21,522 (5.2)</td>
                <td>69,132 (16.6)</td>
                <td>13,039 (3.2)</td>
                <td>25,835 (6.2)</td>
              </tr>
              <tr valign="top">
                <td>MGH<sup>a</sup> Sleep</td>
                <td>2,154,540 (44.3)</td>
                <td>481,488 (9.9)</td>
                <td>700,347 (14.4)</td>
                <td>855,980 (17.6)</td>
                <td>671,168 (13.8)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>MGH: Massachusetts General Hospital.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Problem Formulation</title>
        <p>To set up the experiments, the raw subject EEG recordings, which are multichannel brain waves, were used. First, the unlabeled subject recordings were grouped as the <italic>pretrain</italic> set, and the labeled recordings were grouped into the <italic>training</italic> or <italic>test</italic> sets. The training and test sets are usually small, but their EEG recordings are labeled, while the pretrain set contains a large number of unlabeled recordings. Within each set, the long recordings are segmented into disjoint 30-second windows. Each window is called <italic>an epoch</italic>, denoted as <italic>x</italic>∈R<italic><sup>C</sup></italic><sup>×</sup><italic><sup>N</sup></italic>. Each epoch has the same format: <italic>C</italic> input channels and <italic>N</italic> time stamps from each channel.</p>
        <p>For these data sets, the ground truth labels were released by the original data publishers. To align with the problem’s setting, participants were randomly assigned to the pretrain set, training set, and test set in different proportions (90%: 5%: 5% for the Sleep EDF and MGH sets and 98%: 1%: 1% for the SHHS set, since they have different amounts of data). All epochs segmented from a participant are placed within the same set. The pretrain set is used for self-supervised learning; hence, we removed their labels.</p>
        <p>In the pretrain step, the EEG self-supervised representation learning problem requires building a feature encoder <italic>f</italic>(⋅) from the pretrain set (without labels), which maps an epoch <italic>x</italic> into a vector representation <italic>h</italic>∈R<italic><sup>d</sup></italic>, where <italic>d</italic> is the feature dimensionality, such that the representation <italic>h</italic> can replace raw signals for downstream classification tasks. Evaluation of the encoder <italic>f</italic>(⋅) was conducted on the training and test data (with labels). We focus on sleep staging as the <italic>supervised</italic> step, where the feature vector of a sample <italic>x</italic> will be mapped to 5 sleep cycle labels, awake (W), rapid eye movement (REM; R), non-REM 1 (N1), non-REM 2 (N2), and non-REM 3 (N3), based on the American Academy of Sleep Medicine’s (AASM’s) scoring standards [<xref ref-type="bibr" rid="ref12">12</xref>]. Specifically, based on the feature encoder from the pretrain step, the training set is used to learn a linear model on top of the feature vectors, and the test set is used to evaluate the linear classification performance.</p>
      </sec>
      <sec>
        <title>Background and Existing Methods</title>
        <sec>
          <title>Overview</title>
          <p>Self-supervised learning occurs in the pretrain step, and it uses representation similarity to exploit the unlabeled signals, with an encoder network <italic>f</italic>(⋅):R<italic><sup>C</sup></italic><sup>×</sup><italic><sup>N</sup></italic>→R<italic><sup>d</sup></italic> and a nonlinear projection network <italic>g</italic>(⋅):R<italic><sup>d</sup></italic>→R<italic><sup>m</sup></italic>. Specifically, for a given signal <italic>x</italic> from the pretrain set, commonly, one applies data augmentation methods <italic>a</italic>(⋅) to produce 2 different modified signals <italic>x</italic> ̃', <italic>x</italic> ̃'' (after this procedure, the format does not change), which are then transformed into <italic>h</italic>', <italic>h</italic>''∈R<italic><sup>d</sup></italic> by <italic>f</italic>(⋅) and further into <italic>z</italic>', <italic>z</italic>''∈R<italic><sup>m</sup></italic> by <italic>g</italic>(⋅). The vectors <italic>z</italic>’, <italic>z</italic>’’ are finally normalized with the <italic>L2</italic> norm onto the unit hypersphere <inline-graphic xlink:href="ai_v2i1e46769_fig6.png" xlink:type="simple" mimetype="image"/>

.</p>
          <p>We call <inline-graphic xlink:href="ai_v2i1e46769_fig7.png" xlink:type="simple" mimetype="image"/> the <italic>anchor</italic>,  <inline-graphic xlink:href="ai_v2i1e46769_fig8.png" xlink:type="simple" mimetype="image"/> the <italic>positive sample</italic>, and these 2 together are called a <italic>positive pair</italic>. For the projections <italic>z<sub>k</sub></italic> obtained from other randomly selected signals (by negative sampling strategy), their representation  <inline-graphic xlink:href="ai_v2i1e46769_fig9.png" xlink:type="simple" mimetype="image"/> is commonly conceived of as negative samples (though they are random samples), and any one of them together with the anchor is called a <italic>negative pair</italic> in the existing literature [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. The loss function <italic>L</italic> is derived from the similarity comparison between positive and negative pairs (eg, encouraging the similarity of positive pairs to be stronger than that of all the negative pairs, referred to as the noise contrastive estimation loss [<xref ref-type="bibr" rid="ref13">13</xref>]). A common forward flow of self-supervised learning on EEG signals can be illustrated as  <inline-graphic xlink:href="ai_v2i1e46769_fig10.png" xlink:type="simple" mimetype="image"/>.</p>
          <p>For data augmentation, this study used bandpass filtering, noising, channel flipping, and shifting (see the definition in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and the visual illustrations in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). We conducted ablation studies on the augmentation methods in our experiment and have provided the implementation details. To reduce clutter, we also used <italic>z</italic> to denote the <italic>L2</italic> normalized version in the rest of the paper.</p>
        </sec>
        <sec>
          <title>ContraWR</title>
          <sec>
            <title>Background</title>
            <p>As mentioned above, most existing models use random samples as negative samples, which can introduce issues (that the negative sample might be from the same latent class) for the pretrain step and undermine representation quality. To address the issue, this paper proposes a new self-supervised learning method, ContraWR. ContraWR replaces the large number of negative samples with a single average representation of the batch, called the world representation or global representation. This way is robust as it avoids constructing negative pairs where 2 data are actually obtained from the same latent class. The world representation serves as a reference in our new contrastive principle: the representation similarity between a positive pair should be stronger than the similarity between the anchor and the world representation. Note that the world representation is not fixed but changes with the encoder updating the parameters.</p>
          </sec>
          <sec>
            <title>The World Representation</title>
            <p>Assume <italic>z</italic>’ is the anchor, <italic>z</italic>’’ is the positive sample, and <italic>z<sub>k</sub></italic> denotes a random sample. We generate an average representation of the data set, <italic>z<sub>w</sub></italic> as the only negative sample. To formalize, we assume <italic>k</italic>∼<italic>p</italic>(⋅) is the sample distribution over the data set (ie, <italic>k</italic> is the sample index), independent of the anchor <italic>z</italic>’. The world representation <italic>z<sub>w</sub></italic> is defined by <italic>z<sub>w</sub></italic>=<italic>E<sub>k</sub></italic><sub>∼</sub><italic><sub>p</sub></italic><sub>(⋅)</sub>[<italic>z<sub>k</sub></italic>].</p>
            <p>Here, we denote <italic>D</italic>={<italic>z</italic>:||<italic>z</italic>||≤1, <italic>z</italic>∈R<italic><sup>m</sup></italic>}. Obviously, <italic>z<sub>w</sub></italic>∈<italic>D</italic>. In the experiment, <italic>z<sub>w</sub></italic> is approximated by the average over each batch; that is, we used the average sample representation over the batch  <inline-graphic xlink:href="ai_v2i1e46769_fig11.png" xlink:type="simple" mimetype="image"/> as the world representation, where <italic>M</italic> is the batch size.</p>
          </sec>
          <sec>
            <title>Gaussian Kernel Measure</title>
            <p>We adopted a Gaussian kernel defined on <italic>D</italic>, <italic>sim</italic>(<italic>x</italic>,<italic>y</italic>):<italic>D</italic>×<italic>D</italic>→(0,1] as a similarity measure. Formally, given 2 feature projections <italic>z</italic>’, <italic>z</italic>’’ the similarity is defined as  <inline-graphic xlink:href="ai_v2i1e46769_fig12.png" xlink:type="simple" mimetype="image"/>, where <italic>σ</italic> is a hyperparameter. The Gaussian kernel combined with the following triplet loss gives the alignment and uniformity properties in the loss convergence (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). When <italic>σ</italic> becomes large, the Gaussian kernel measure will reduce to cosine similarity.</p>
          </sec>
          <sec>
            <title>Loss Function</title>
            <p>For the anchor <italic>z</italic>’, the positive sample z’’ and the world representation <italic>z<sub>w</sub></italic>, we devise a triplet loss, <italic>L</italic>=[<italic>sim</italic>(<italic>z</italic>', <italic>z<sub>w</sub></italic>)+<italic>δ</italic>–<italic>sim</italic>(<italic>z</italic>', <italic>z</italic>'')]<sub>+</sub>, where <italic>δ</italic>&gt;0 is the empirical margin, a hyperparameter. The loss is minimized over batches, ensuring that the similarity of positive pairs <italic>sim</italic>(<italic>z</italic>’, <italic>z</italic>’’), is larger than the similarity to the world representation <italic>sim</italic>(<italic>z</italic>’, <italic>z<sub>w</sub></italic>), by a margin of <italic>δ</italic>.</p>
            <p>The pipeline of our ContraWR is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. The online networks <italic>f<sub>θ</sub></italic>(⋅), <italic>g<sub>θ</sub></italic>(⋅) and the target networks <italic>f<sub>ϕ</sub></italic>(⋅), <italic>g<sub>ϕ</sub></italic>(⋅) share an identical network structure. Encoder networks <italic>f<sub>θ</sub></italic>(⋅), <italic>f<sub>ϕ</sub></italic>(⋅) map 2 augmented versions of the same signal to respective feature representations. Then, the projection networks <italic>g<sub>θ</sub></italic>(⋅), <italic>g<sub>ϕ</sub></italic>(⋅) project the feature representations onto a unit hypersphere, where the loss is defined. During optimization, the online networks are updated by gradient descent, and the target networks update parameters from the online network with an exponential moving average (EMA) trick [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
            <p><italic>θ</italic><sup>(</sup><italic><sup>n</sup></italic><sup>+1)</sup>← <italic>θ</italic> <sup>(</sup><italic><sup>n</sup></italic><sup>)</sup>–<italic>η</italic>⋅∇<sub>θ</sub><bold><italic>L</italic></bold></p>
            <p><italic>ϕ</italic><sup>(</sup><italic><sup>n</sup></italic><sup>+1)</sup>←<italic>λ</italic>⋅<italic>ϕ</italic><sup>(</sup><italic><sup>n</sup></italic><sup>)</sup>+(1–<italic>λ</italic>)⋅<italic>θ</italic><sup>(</sup><italic><sup>n</sup></italic><sup>+1)</sup></p>
            <p>where <italic>n</italic> indicates the <italic>n</italic>th update, <italic>η</italic> is the learning rate, and <italic>λ</italic> is a weight hyperparameter. After this optimization in the pretrain step, the encoder network <italic>f<sub>θ</sub></italic>(⋅) is ready to be evaluated on the training and test sets in the supervised step.</p>
            <fig id="figure1" position="float">
              <label>Figure 1</label>
              <caption>
                <p>The Contrast with the World Representation (ContraWR) model pipeline. We show the 2-way model pipeline in this figure. The online network (upper) is updated by gradient descent, while the target network (lower) is updated by the exponential moving average. Finally, the results of the 2 models form the triplet loss function.</p>
              </caption>
              <graphic xlink:href="ai_v2i1e46769_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
        </sec>
        <sec>
          <title>ContraWR+: Contrast With Instance-Aware World Representation</title>
          <sec>
            <title>Background</title>
            <p>To learn a better representation, we introduced a weighted averaged world representation based on the harder principle: the similarity between a positive pair should be stronger than the similarity between the anchor and the weighted average feature representations of the data set, where the weight is set higher for closer samples. We call the new model ContraWR+. This is a more difficult objective than the simple global average in ContraWR.</p>
          </sec>
          <sec>
            <title>Instance-Aware World Representation</title>
            <p>In this new model, the world representation is enhanced by modifying the sampling distribution to be instance-specific. We define <italic>p</italic>(⋅|<italic>z</italic>) as the instance-aware sampling distribution of an anchor <italic>z</italic>, which is different from the sample distribution <italic>p</italic>(⋅) used in ContraWR,  <inline-graphic xlink:href="ai_v2i1e46769_fig13.png" xlink:type="simple" mimetype="image"/>, where <italic>T</italic>&gt;0 is a temperature hyperparameter, such that similar samples are selected with higher probability parametrized by <italic>p</italic>(⋅|<italic>z</italic>). Consequently, for an anchor <italic>z</italic>’, the instance-aware world representation becomes  <inline-graphic xlink:href="ai_v2i1e46769_fig14.png" xlink:type="simple" mimetype="image"/>.</p>
            <p>Here, <italic>T</italic> controls the contrastive hardness of the world representation. When <italic>T</italic>→∞, <italic>p</italic>(⋅|<italic>z</italic>) is asymptotically identical to <italic>p</italic>(⋅), and the above equation reduces to the simple global average form <italic>z<sub>w</sub></italic>=<italic>E<sub>k</sub></italic><sub>∼</sub><italic><sub>p</sub></italic><sub>(⋅)</sub>[<italic>z<sub>k</sub></italic>]; when <italic>T</italic>→0<sup>+</sup>, the form becomes trivial, <italic>z<sub>w</sub></italic>=<italic>argmax<sub>zk</sub></italic>(<italic>sim</italic>(<italic>z</italic>', <italic>z<sub>k</sub></italic>)). We have tested different <italic>T</italic> and found that the model is not sensitive to <italic>T</italic> over a wide range. Here, <italic>z<sub>w</sub></italic> is also practically implemented by using the weighted average over each batch. We can rewrite the similarity measure given the anchor <italic>z<sub>i</sub></italic> and the new world representation <italic>z<sub>w</sub></italic> as:</p>
            <p><italic>sim</italic>(<italic>z<sub>i</sub></italic>, <italic>z<sub>w</sub></italic>)=<italic>sim</italic>(<italic>z</italic>', <italic>E<sub>k</sub></italic><sub>∼</sub><italic><sub>p</sub></italic><sub>(⋅|z')</sub>[<italic>z<sub>k</sub></italic>])</p>
            <disp-formula>
              <graphic xlink:href="ai_v2i1e46769_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
            <p>In this new method, we also used triplet loss as the final objective.</p>
          </sec>
        </sec>
        <sec>
          <title>Implementations</title>
          <sec>
            <title>Signal Augmentation</title>
            <p>For the experiments, we used four augmentation methods, illustrated in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>: (1) bandpass filtering: to reduce noise, we used an order-1 Butterworth filter (the bandpass is specified in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>); (2) noising: we added extra high- or low-frequency noise to each channel, mimicking the physical distortion; (3) channel flipping: corresponding sensors from the left and right sides of the head were swapped due to symmetry; and (4) shifting: within one sample, we advanced or delayed the signal for a certain time span. Detailed configurations of augmentation methods vary for the 3 data sets, and we have listed them in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
          </sec>
          <sec>
            <title>Baseline Methods</title>
            <p>In the experiments, several recent self-supervised learning methods were implemented for comparison.</p>
            <p>MoCo [<xref ref-type="bibr" rid="ref2">2</xref>] devises 2 parallel encoders with an EMA. It also uses a large memory table to store new negative samples, which are frequently updated.</p>
            <p>SimCLR [<xref ref-type="bibr" rid="ref3">3</xref>] uses an encoder network to generate both anchor and positive samples, where negative samples are collected from the same batch.</p>
            <p>BYOL [<xref ref-type="bibr" rid="ref7">7</xref>] also uses 2 encoders: an online network and a target network. They put one more predictive layer on top of the online network to predict (reconstruct) the result from the target network, while no negative samples are presented.</p>
            <p>SimSiam [<xref ref-type="bibr" rid="ref8">8</xref>] uses the same encoder networks on 2 sides and also does not use the negative samples.</p>
            <p>Average k-nearest neighbor TopX is our developed baseline model, which identifies the top X nearest neighbors for each sample within the batch and uses the average representation of these top X neighbors as the negative sample. We used the same triplet loss as our ContraWR model. In the experiments, we tested X=1, X=5, and X=50. When X approaches the batch size, this model will gradually reduce to ContraWR.</p>
          </sec>
          <sec>
            <title>Model Architecture</title>
            <p>For a fair comparison, all models, including baseline approaches and our models, use the same augmentation and encoder architecture, as shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. This architecture cascades a short-time Fourier transform (STFT) operation, a 2D convolutional neural network layer, and three 2D convolutional blocks. Empirically, we found that the application of neural networks generates better accuracy on the STFT spectrogram of the signals than on the raw signals. The same practices were reported by Yang et al [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p>
            <p>We also considered a supervised model (called <italic>Supervised</italic>) as a reference model, which uses the same encoder architecture and adds a 2-layer fully connected network (128, 256, and 192 units for the Sleep EDF, SHHS, and MGH data sets, respectively) for the sleep staging classification task. The supervised model does not use the pretrain set but is trained from scratch on raw EEG signals in the training set and tested on the test set. We also included an untrained encoder model as a baseline, where the encoder was initialized but not optimized in the pretrain step.</p>
            <fig id="figure2" position="float">
              <label>Figure 2</label>
              <caption>
                <p>The short-time Fourier transform (STFT) convolutional encoder network. The encoder network first transforms raw signals into a spectrogram via STFT, and then a convolutional neural network–based encoder is built on top of the spectrogram. ELU: exponential linear unit; FFT: Fast Fourier Transform; Conv.: convolution operation.</p>
              </caption>
              <graphic xlink:href="ai_v2i1e46769_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </fig>
          </sec>
        </sec>
        <sec>
          <title>Evaluation Protocol</title>
          <p>We evaluated performance on the sleep staging task with overall 5-class classification accuracy. Each experiment was conducted with 5 different random seeds. For self-supervised methods, we optimized the encoder for 100 epochs (here, “epoch” is a concept in deep learning) with unlabeled data, used the training set to find a good logistic classifier, and used the test set data for evaluation in accordance with He et al [<xref ref-type="bibr" rid="ref2">2</xref>] and Chen et al [<xref ref-type="bibr" rid="ref3">3</xref>]. For the supervised method, we trained the model for 100 epochs on the training set. Our setting ensures the convergence of all models.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Better Accuracy in Sleep Staging</title>
        <p>Comparisons on the downstream sleep staging task are shown in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
        <p>All self-supervised methods outperformed the untrained encoder model, indicating that the pretrain step does learn some useful features from unlabeled data. We observed that ContraWR and ContraWR+ both outperform the supervised model, suggesting that the feature representations provided by the encoder can better preserve the predictive features and filter out noises than using the raw signals for the sleep staging task, in cases when the amount of labeled data available is not sufficient (eg, less than 2% in Sleep EDF). Compared to other self-supervised methods, our proposed model ContraWR+ also provided better predictive accuracy; that is, about 1.3% on Sleep EDF, 0.8% on SHHS, and 1.3% on MGH Sleep. The performance improvements were mostly significant (<italic>P</italic>&lt;.001; for the comparison with MoCo on the Sleep EDF data set, <italic>P</italic>=.002). MGH Sleep data contain more noise than the other 2 data sets (reflected by the relatively low accuracy with the supervised model on raw signals). The performance gain was notably more significant on MGH over other self-supervised or supervised models (about 3.3% relative improvement on accuracy), which suggests that the proposed models handle noisy environments better.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Comparison of sleep staging accuracy with different methods.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="300"/>
            <col width="250"/>
            <col width="250"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Name</td>
                <td colspan="3">Sleep staging accuracy (%), mean (SD)<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sleep EDF data set</td>
                <td>Sleep Heart Health Study data set</td>
                <td>MGH<sup>b</sup> Sleep data set</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Supervised</td>
                <td>84.98 (0.3562)</td>
                <td>75.61 (0.9347)</td>
                <td>69.73 (0.4324)</td>
              </tr>
              <tr valign="top">
                <td>Untrained Encoder</td>
                <td>77.83 (0.0232)</td>
                <td>60.03 (0.0448)</td>
                <td>55.64 (0.0082)</td>
              </tr>
              <tr valign="top">
                <td>MoCo<sup>c</sup></td>
                <td>85.58 (0.7707)</td>
                <td>77.10 (0.2743)</td>
                <td>62.14 (0.7099)</td>
              </tr>
              <tr valign="top">
                <td>SimCLR<sup>d</sup></td>
                <td>83.79 (0.3532)</td>
                <td>76.61 (0.3007)</td>
                <td>67.32 (0.7749)</td>
              </tr>
              <tr valign="top">
                <td>BYOL<sup>e</sup></td>
                <td>85.61 (0.7080)</td>
                <td>76.64 (0.3783)</td>
                <td>70.75 (0.1461)</td>
              </tr>
              <tr valign="top">
                <td>SimSiam<sup>f</sup></td>
                <td>84.78 (0.8028)</td>
                <td>74.25 (0.4796)</td>
                <td>62.08 (0.4902)</td>
              </tr>
              <tr valign="top">
                <td>AVG-KNN-Top1<sup>g</sup></td>
                <td>80.39 (1.3721)</td>
                <td>69.70 (0.8944)</td>
                <td>60.73 (0.7423)</td>
              </tr>
              <tr valign="top">
                <td>AVG-KNN-Top5</td>
                <td>83.24 (0.6182)</td>
                <td>75.18 (0.7845)</td>
                <td>69.14 (0.3393)</td>
              </tr>
              <tr valign="top">
                <td>AVG-KNN-Top50</td>
                <td>86.35 (0.3246)</td>
                <td>77.63 (0.3625)</td>
                <td>71.95 (0.3482)</td>
              </tr>
              <tr valign="top">
                <td>ContraWR<sup>h</sup></td>
                <td>85.94 (0.2326)</td>
                <td>77.52 (0.5748)</td>
                <td>71.97 (0.1774)</td>
              </tr>
              <tr valign="top">
                <td>ContraWR+</td>
                <td>86.90 (0.2288)</td>
                <td>77.97 (0.2693)</td>
                <td>72.03 (0.1823)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Calculated over 5 random seeds.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>MGH: Massachusetts General Hospital.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>MoCo: Momentum Contrast.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>SimCLR: simple framework for contrastive learning of visual representations.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>BYOL: Bootstrap Your Own Latent.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>SimSiam: simple Siamese.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>AVG-KNN-TopX: average k-nearest neighbor TopX.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>ContraWR: Contrast with the World Representation.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ablation Study on Data Augmentations</title>
        <p>We also inspected the effectiveness of different augmentation methods on EEG signals, shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p>
        <p>We empirically test all possible combinations of 4 considered augmentations: channel flipping, bandpass filtering, noising, and shifting. Since channel flipping cannot be applied by itself, we combined it with other augmentations. The evaluation was conducted on Sleep EDF data with the ContraWR+ model. To sum up, all augmentation methods are beneficial, and collectively, they can further boost the classification performance.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Evaluation accuracy of different augmentations.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Augmentations</td>
                <td>Accuracy (%), mean (SD)<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Bandpass</td>
                <td>84.23 (0.2431)</td>
              </tr>
              <tr valign="top">
                <td>Noising</td>
                <td>83.60 (0.1182)</td>
              </tr>
              <tr valign="top">
                <td>Shifting</td>
                <td>84.65 (0.2844)</td>
              </tr>
              <tr valign="top">
                <td>Bandpass + flipping</td>
                <td>85.77 (0.2337)</td>
              </tr>
              <tr valign="top">
                <td>Noising + flipping</td>
                <td>84.45 (0.1420)</td>
              </tr>
              <tr valign="top">
                <td>Shifting + flipping</td>
                <td>85.13 (0.0558)</td>
              </tr>
              <tr valign="top">
                <td>Bandpass + noising</td>
                <td>85.37 (0.1214)</td>
              </tr>
              <tr valign="top">
                <td>Noising + shifting</td>
                <td>84.78 (0.1932)</td>
              </tr>
              <tr valign="top">
                <td>Shifting + bandpass</td>
                <td>85.25 (0.1479)</td>
              </tr>
              <tr valign="top">
                <td>Bandpass + noising + flipping</td>
                <td>85.76 (0.1794)</td>
              </tr>
              <tr valign="top">
                <td>Noising + shifting + flipping</td>
                <td>85.17 (0.2301)</td>
              </tr>
              <tr valign="top">
                <td>Shifting + bandpass + flipping</td>
                <td>86.38 (0.2789)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Calculated over 5 random seeds.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Varying Amount of Training Data</title>
        <p>To further investigate the benefits of self-supervised learning, we evaluated the effectiveness of the learned feature representations with varying training data on Sleep EDF (<xref rid="figure3" ref-type="fig">Figure 3</xref>). The default setting is to split all the data into pretrain, training, or test sets by 90%: 5%: 5%. In this section, we maintained the 5% test set constant and resplit the pretrain and training sets (after resplitting, we ensured that all the training set data have labels and removed the labels from the pretrain set), such that the training proportion becomes 0.5%, 1%, 2%, 5%, and 10%, and the rest is used for the pretrain set. This resplitting was conducted at the subject level, after which we again segmented each subject’s recording within the pretrain or training set. We compared our ContraWR+ model to MoCo, SimCLR, BYOL, SimSiam, and the supervised baseline models. Similar ablation studies on SHHS and MGH can be found in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. Our model outperforms the compared models consistently with different amounts of training data. For example, our model achieves similar performance (with only 5% data as training) to that of the best baseline, BYOL, which needs twice the amount of training data (10% data as training). Also, compared to the supervised model, the self-supervised methods performed better when the labels were insufficient; for example, only ≤2% of the data were labeled.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Model performance with different amounts of training data (on the Sleep EDF data set). The curves indicate mean values and shaded areas show the SD of the training/test over 5 random seeds. All models have the same encoder network architecture. For the self-supervised method, we trained a logistic regression model on top of the frozen encoder with the training set, and for the supervised model, we trained the encoder along with the final nonlinear classification layer from scratch with the training set. The proportion of training data is 0.5%, 1%, 2%, 5%, and 10%. Each configuration runs with 5 different random seeds and the error bars indicate the SD over 5 seeds. BYOL: Bootstrap Your Own Latent; MoCo: Momentum Contrast; SimCLR: simple framework for contrastive learning of visual representations; SimSiam: simple Siamese.</p>
          </caption>
          <graphic xlink:href="ai_v2i1e46769_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Representation Projection</title>
        <p>We next sought to assess the quality of the learned feature representations. To do this, we used the representations produced by ContraWR+ on the MGH data set and randomly selected 5000 signal epochs per label from the data set. The ContraWR+ encoder is optimized on the pretrain step without using the labels. We extracted feature representations for each sample through the encoder network and used uniform manifold approximation and projection (UMAP) [<xref ref-type="bibr" rid="ref16">16</xref>] to project onto the 2D space. We finally color-coded samples according to sleep stage labels for illustration.</p>
        <p>The 2D projection is shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>. We also computed the confusion matrix from the evaluation stage (based on the test set; also shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>). In the UMAP projection, epochs from the same latent class are closely colocated, which implies that the pretrain step extracts important information for sleep stage classification from the raw unlabeled EEG signals. Stage N1 overlaps with stages W, N2, and N3, which is as expected given that N1 is often ambiguous and thus difficult to classify even for well-trained experts [<xref ref-type="bibr" rid="ref1">1</xref>].</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Uniform manifold approximation and projection and confusion matrix. (A) Using the Massachusetts General Hospital (MGH) Sleep data set, we projected the output representations of each signal into a 2D space and colored them by the actual labels. (B) We have included a confusion matrix on sleep staging.</p>
          </caption>
          <graphic xlink:href="ai_v2i1e46769_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Hyperparameter Ablation Study</title>
        <p>To investigate the sensitivity of our model to hyperparameter settings, we tested with different batch sizes and trained on different values for the Gaussian parameter <italic>σ</italic>, temperature <italic>T</italic>, and margin <italic>δ</italic>. We focused on the ContraWR+ model and evaluated it on the Sleep EDF data set. During the experiment, the default settings are a batch size of 256, <italic>σ</italic> of 2, <italic>T</italic> of 2, <italic>δ</italic> of 0.2, learning rate <italic>η</italic> of 2×10<sup>–4</sup>, weight decay of 10<sup>–4</sup>, and epoch of 100. When testing on 1 hyperparameter, others are maintained constant.</p>
        <p>The ablation study’s results are shown in <xref rid="figure5" ref-type="fig">Figure 5</xref>; the red star indicates the default configuration. Each configuration runs with 5 different random seeds, and the error bars indicate the SD over 5 experiments. We see that the model is not sensitive to batch size. Over a large range (&lt;10), the model is also insensitive to the Gaussian width <italic>σ</italic>. For temperature <italic>T</italic>, we noted previously that a very small <italic>T</italic> may be problematic, and a very large <italic>T</italic> reduces ContraWR+ to ContraWR. Based on the ablation experiments, the performance is relatively insensitive to choices of <italic>T</italic>. For the margin <italic>δ</italic>, the difference in distance is bounded (given a fixed <italic>σ</italic> of 2):</p>
        <disp-formula>
          <graphic xlink:href="ai_v2i1e46769_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Thus, <italic>δ</italic> should be large enough; that is, <italic>δ</italic>≥0.1.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Ablation study on batch size and 3 hyperparameters. The curves indicate the mean values and shaded areas show the SD of training/test over 5 random seeds. The red star denotes the default setting. With a larger batch size, the model performs better, but it is not sensitive to any of the hyperparameters.</p>
          </caption>
          <graphic xlink:href="ai_v2i1e46769_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study has been approved by the Institutional Review Board of Beth Israel Deaconess Medical Center (BIDMC IRB protocol #2022P000417 [Brain Informatics Database]).</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Results</title>
        <p>Our proposed ContraWR and ContraWR+ models outperformed 4 recent self-supervised learning methods on the sleep staging task across 3 large EEG data sets (<italic>P</italic>&lt;.001 in almost all cases). ContraWR+ also superseded supervised learning when fewer training labels were available (eg, a 4% improvement in accuracy when less than 2% of data were labeled). Moreover, the models provided well-separated representative structures in 2D projection.</p>
      </sec>
      <sec>
        <title>Comparison With Prior Work</title>
        <sec>
          <title>Self-Supervised Learning</title>
          <p>Many deep generative methods have been proposed for unsupervised representation learning. They mostly rely on autoencoding [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>] or adversarial training [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. Mutual information maximization is also popular for compressing input data into a latent representation [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>].</p>
          <p>Recently, self-supervised contrastive learning [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] has become popular, where loss functions are devised from representation similarity and negative sampling. However, one recent study [<xref ref-type="bibr" rid="ref4">4</xref>] highlighted inherent limitations of negative sampling and showed that this strategy could hurt the learned representation significantly [<xref ref-type="bibr" rid="ref5">5</xref>]. To address these limitations, Chuang et al [<xref ref-type="bibr" rid="ref5">5</xref>] used the law of total probability and approximated the per-class negative sample distribution using the weighted sum of the global data distribution and the expected class label distribution. However, without the actual labels, the true class label distribution is unknown. Grill et al [<xref ref-type="bibr" rid="ref7">7</xref>] and Chen and He [<xref ref-type="bibr" rid="ref8">8</xref>] proposed ignoring negative samples and learning latent representations using only positive pairs.</p>
          <p>In this paper, we leverage the negative information by replacing negative samples with the average representation of the batch samples (ie, the world representation). We argue and provide experiments showing that contrasting with the world representation is more powerful and robust in the noisy EEG setting.</p>
        </sec>
        <sec>
          <title>EEG Sleep Staging</title>
          <p>Before the emergence of deep learning, several traditional machine learning approaches [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>] significantly advanced the field using hand-crafted features, as highlighted by Biswal et al [<xref ref-type="bibr" rid="ref29">29</xref>]. Recently, deep learning models have been applied to various large sleep databases. SLEEPNET [<xref ref-type="bibr" rid="ref29">29</xref>] built a comprehensive system combining many machine learning models to learn sleep signal representations. Biswal et al [<xref ref-type="bibr" rid="ref1">1</xref>] designed a multilayer recurrent and convolutional neural network model to process multichannel signals from EEG. To provide interpretable stage prototypes, Al-Hussaini et al [<xref ref-type="bibr" rid="ref30">30</xref>] developed a SLEEPER model that uses a particular deep learning approach called prototype learning guided by a decision tree to provide more interpretable results. These studies rely on a large set of labeled training data. However, the annotations are expensive, and oftentimes the labeled set is small. In this study, we exploited the large set of unlabeled data to improve the classification, which is more challenging.</p>
        </sec>
        <sec>
          <title>Self-Supervised Learning on Physiological Signals</title>
          <p>While image [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>], video [<xref ref-type="bibr" rid="ref33">33</xref>], language [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>], and speech [<xref ref-type="bibr" rid="ref36">36</xref>] representations have benefited from contrastive learning, research on learning physiological signals has been limited [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Lemkhenter et al [<xref ref-type="bibr" rid="ref39">39</xref>] proposed phase and amplitude coupling for physiological data augmentation. Banville et al [<xref ref-type="bibr" rid="ref40">40</xref>] conducted representation learning on EEG signals, and they targeted monitoring and pathology screening tasks, without using frequency information. Cheng et al [<xref ref-type="bibr" rid="ref41">41</xref>] learned subject-aware representations for electrocardiography data and tested various augmentation methods. While most of these methods are based on pairwise similarity comparison, our model provides contrastive information from global data statistics, providing more robust representations. Also, we extracted signal information from the spectral domain.</p>
        </sec>
      </sec>
      <sec>
        <title>Strengths and Limitations</title>
        <p>The strengths of our study are (1) we used 3 real-world data sets collected from different institutes and across different year ranges, 2 of which are publicly available; (2) our PSG recordings are diverse and generalizable, including 2 data sets collected at home and 1 collected in the laboratory setting, all having relatively large sizes; (3) we have open-sourced our data processing pipelines and all programs used for this study [<xref ref-type="bibr" rid="ref42">42</xref>], including the baseline model implementations; and (4) we proposed new data augmentation methods for PSG signals and have systematically evaluated their effectiveness. However, the following limitations of our study should be noted: (1) we fixed the neural network encoder architecture in the study, which we plan to explore using other models including recurrent neural networks in the future; (2) we have used STFT to extract spectrograms, but we may consider alternative techniques such as wavelet transformation in future; and (3) our current data augmentation methods are based on clinical knowledge, and we aim to investigate data-driven approaches to design more effective methods in the future.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study is motivated by the need to learn effective EEG representations from large unlabeled noisy EEG data sets. We propose a self-supervised contrastive method, ContraWR, and its enhanced variant, ContraWR+. Instead of creating a large number of negative samples, our method contrasts samples with an average representation of many samples. The model is evaluated on a downstream sleep staging task with 3 real-world EEG data sets. Extensive experiments show that the model is more powerful and robust than multiple baselines including MoCo, SimCLR, BYOL, and SimSiam. ContraWR+ also outperforms the supervised counterpart in label-insufficient scenarios.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary material on model implementation.</p>
        <media xlink:href="ai_v2i1e46769_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 168 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Illustration for data augmentations (bandpass filtering, noising, flipping, and shifting).</p>
        <media xlink:href="ai_v2i1e46769_app2.png" xlink:title="PNG File , 368 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Theoretical loss boundness analysis.</p>
        <media xlink:href="ai_v2i1e46769_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 562 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Results on the SHHS and MGH data sets when varying the label sizes.</p>
        <media xlink:href="ai_v2i1e46769_app4.pdf" xlink:title="PDF File  (Adobe PDF File), 24 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AASM</term>
          <def>
            <p>American Academy of Sleep Medicine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BYOL</term>
          <def>
            <p>Bootstrap Your Own Latent</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">ContraWR</term>
          <def>
            <p>Contrast with the World Representation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EEG</term>
          <def>
            <p>electroencephalogram</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">EMA</term>
          <def>
            <p>exponential moving average</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MGH</term>
          <def>
            <p>Massachusetts General Hospital</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">MoCo</term>
          <def>
            <p>Momentum Contrast</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">PSG</term>
          <def>
            <p>polysomnography</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">REM</term>
          <def>
            <p>rapid eye movement</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">SHHS</term>
          <def>
            <p>Sleep Heart Health Study</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">SimCLR</term>
          <def>
            <p>simple framework for contrastive learning of visual representations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">SimSiam</term>
          <def>
            <p>simple Siamese</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">STFT</term>
          <def>
            <p>short-time Fourier transform</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">UMAP</term>
          <def>
            <p>uniform manifold approximation and projection</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was in part supported by the National Science Foundation (awards SCH-2014438, IIS-1418511, CCF-1533768, and IIS-1838042), the National Institutes of Health (R01NS107291, R56HL138415, 1R01NS102190, 1R01NS102574, and RF1AG064312), the Glenn Foundation for Medical Research and the American Federation for Aging Research (Breakthroughs in Gerontology Grant), and the American Academy of Sleep Medicine (AASM Foundation Strategic Research Award).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>CY implemented the methods and conducted the experiments. All authors were involved in conceptualizing the study and drafting the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>MBW is the cofounder of Beacon Biosignals, which played no role in this study.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Biswal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Goparaju</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Westover</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bianchi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Expert-level sleep scoring with deep neural networks</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>12</month>
          <day>01</day>
          <volume>25</volume>
          <issue>12</issue>
          <fpage>1643</fpage>
          <lpage>1650</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30445569"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocy131</pub-id>
          <pub-id pub-id-type="medline">30445569</pub-id>
          <pub-id pub-id-type="pii">5185596</pub-id>
          <pub-id pub-id-type="pmcid">PMC6289549</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Girshick</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Momentum Contrast for Unsupervised Visual Representation Learning</article-title>
          <year>2020</year>
          <conf-name>2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>June 13-19, 2020</conf-date>
          <conf-loc>Seattle, WA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr42600.2020.00975</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kornblith</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Norouzi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A simple framework for contrastive learning of visual representations</article-title>
          <source>arXiv</source>
          <comment> Preprint posted online February 13, 2020</comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arora</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Khandeparkar</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Khodak</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Plevrakis</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Saunshi</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>A Theoretical Analysis of Contrastive Unsupervised Representation Learning</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online February 25, 2019</comment>
          <pub-id pub-id-type="doi">10.1090/mbk/121/79</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chuang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Torralba</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jegelka</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Debiased Contrastive Learning</article-title>
          <year>2020</year>
          <conf-name>34th Conference on Neural Information Processing Systems (NeurIPS2020)</conf-name>
          <conf-date>2020</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Robinson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chuang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jegelka</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Contrastive learning with hard negative samples</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online October 9, 2020</comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grill</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Strub</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Altché</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tallec</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Richemond</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Buchatskaya</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Doersch</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Pires</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Azar</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Piot</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kavukcuoglu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Munos</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Valko</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Bootstrap Your Own Latent - A New Approach to Self-Supervised Learning</article-title>
          <year>2020</year>
          <conf-name>34th Conference on Neural Information Processing Systems (NeurIPS2020)</conf-name>
          <conf-date>2020</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Exploring Simple Siamese Representation Learning</article-title>
          <year>2021</year>
          <conf-name>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>June 20-25, 2021</conf-date>
          <conf-loc>Nashville, TN</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr46437.2021.01549</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mueller</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rueschman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mariani</surname>
              <given-names>Sara</given-names>
            </name>
            <name name-style="western">
              <surname>Mobley</surname>
              <given-names>Daniel</given-names>
            </name>
            <name name-style="western">
              <surname>Redline</surname>
              <given-names>Susan</given-names>
            </name>
          </person-group>
          <article-title>The National Sleep Research Resource: towards a sleep data commons</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>10</month>
          <day>01</day>
          <volume>25</volume>
          <issue>10</issue>
          <fpage>1351</fpage>
          <lpage>1358</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29860441"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocy064</pub-id>
          <pub-id pub-id-type="medline">29860441</pub-id>
          <pub-id pub-id-type="pii">5026200</pub-id>
          <pub-id pub-id-type="pmcid">PMC6188513</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Iber</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kiley</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nieto</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rapoport</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Redline</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Robbins</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Samet</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Wahl</surname>
              <given-names>PW</given-names>
            </name>
          </person-group>
          <article-title>The Sleep Heart Health Study: design, rationale, and methods</article-title>
          <source>Sleep</source>
          <year>1997</year>
          <volume>20</volume>
          <issue>12</issue>
          <fpage>1077</fpage>
          <lpage>1085</lpage>
          <pub-id pub-id-type="doi">10.1093/sleep/20.12.1077</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kemp</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zwinderman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tuk</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kamphuisen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Oberyé</surname>
              <given-names>J J</given-names>
            </name>
          </person-group>
          <article-title>Analysis of a sleep-dependent neuronal feedback loop: the slow-wave microcontinuity of the EEG</article-title>
          <source>IEEE Trans Biomed Eng</source>
          <year>2000</year>
          <month>09</month>
          <volume>47</volume>
          <issue>9</issue>
          <fpage>1185</fpage>
          <lpage>1194</lpage>
          <pub-id pub-id-type="doi">10.1109/10.867928</pub-id>
          <pub-id pub-id-type="medline">11008419</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berry</surname>
              <given-names>RB</given-names>
            </name>
            <name name-style="western">
              <surname>Brooks</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gamaldo</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Harding</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Lloyd</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Quan</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Troester</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Vaughn</surname>
              <given-names>BV</given-names>
            </name>
          </person-group>
          <article-title>AASM Scoring Manual Updates for 2017 (Version 2.4)</article-title>
          <source>J Clin Sleep Med</source>
          <year>2017</year>
          <month>05</month>
          <day>15</day>
          <volume>13</volume>
          <issue>5</issue>
          <fpage>665</fpage>
          <lpage>666</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28416048"/>
          </comment>
          <pub-id pub-id-type="doi">10.5664/jcsm.6576</pub-id>
          <pub-id pub-id-type="medline">28416048</pub-id>
          <pub-id pub-id-type="pii">jc-17-00167</pub-id>
          <pub-id pub-id-type="pmcid">PMC5406946</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gutmann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hyvärinen</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Noise-contrastive estimation: A new estimation principle for unnormalized statistical models</article-title>
          <year>2010</year>
          <conf-name>13th International Conference on Artificial Intelligence and Statistics (AISTATS) 2010</conf-name>
          <conf-date>2010</conf-date>
          <conf-loc>Sardinia, Italy</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Westover</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Solomonik</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>ATD: augmenting CP tensor decomposition by self supervision</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online June 15, 2021</comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Westover</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>ManyDG: many-domain generalization for healthcare applications</article-title>
          <source>arXiv</source>
          <comment> Preprint posted online January 21, 2023</comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Healy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Saul</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Großberger</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>UMAP: Uniform Manifold Approximation and Projection</article-title>
          <source>JOSS</source>
          <year>2018</year>
          <month>09</month>
          <volume>3</volume>
          <issue>29</issue>
          <fpage>861</fpage>
          <pub-id pub-id-type="doi">10.21105/joss.00861</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Larochelle</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Vincent</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lajoie</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Manzagol</surname>
              <given-names>P-A</given-names>
            </name>
          </person-group>
          <article-title>Stacked denoising autoencoders: learning useful representations in a deep network with a local denoising criterion</article-title>
          <source>J Mach Learn Res</source>
          <year>2010</year>
          <volume>11</volume>
          <fpage>3371</fpage>
          <lpage>3408</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baldi</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Autoencoders, Unsupervised Learning, and Deep Architectures</article-title>
          <year>2012</year>
          <conf-name>ICML Workshop on Unsupervised and Transfer Learning</conf-name>
          <conf-date>2012</conf-date>
          <conf-loc>Bellevue, WA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Auto-encoding variational bayes</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online December 20, 2013</comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Donahue</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Krähenbühl</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Darrell</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Adversarial feature learning</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 31, 2016</comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodfellow</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Pouget-Abadie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mirza</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Warde-Farley</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ozair</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Courville</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Generative adversarial networks</article-title>
          <source>Commun ACM</source>
          <year>2020</year>
          <volume>63</volume>
          <issue>11</issue>
          <fpage>139</fpage>
          <lpage>144</lpage>
          <pub-id pub-id-type="doi">10.1145/3422622</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shrivastava</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pfister</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tuzel</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Susskind</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Webb</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Learning from Simulated and Unsupervised Images through Adversarial Training</article-title>
          <year>2017</year>
          <conf-name>2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>July 21-26, 2017</conf-date>
          <conf-loc>Honolulu, HI</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2017.241</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hjelm</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Fedorov</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lavoie-Marchildon</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Grewal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bachman</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Trischler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Learning deep representations by mutual information estimation and maximization</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online August 20, 2018</comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tschannen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Djolonga</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rubenstein</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gelly</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lucic</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>On mutual information maximization for representation learning</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online July 31, 2019</comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bachman</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hjelm</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Buchwalter</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Learning representations by maximizing mutual information across views</article-title>
          <year>2019</year>
          <conf-name>33rd Conference on Neural Information Processing Systems (NeurIPS2019)</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Vancouver, BC</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fraiwan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lweesy</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Khasawneh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fraiwan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wenz</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dickhaus</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Classification of sleep stages using multi-wavelet time frequency entropy and LDA</article-title>
          <source>Methods Inf Med</source>
          <year>2018</year>
          <month>01</month>
          <day>17</day>
          <volume>49</volume>
          <issue>03</issue>
          <fpage>230</fpage>
          <lpage>237</lpage>
          <pub-id pub-id-type="doi">10.3414/me09-01-0054</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Anderer</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Moreau</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Woertz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gruber</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Parapatics</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Loretz</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Heller</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Boeck</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moser</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kloesch</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Saletu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Saletu-Zyhlarz</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Danker-Hopfe</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zeitlhofer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dorffner</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Computer-assisted sleep classification according to the standard of the American Academy of Sleep Medicine: validation study of the AASM version of the Somnolyzer 24 × 7</article-title>
          <source>Neuropsychobiology</source>
          <year>2010</year>
          <month>9</month>
          <day>9</day>
          <volume>62</volume>
          <issue>4</issue>
          <fpage>250</fpage>
          <lpage>264</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1159/000320864"/>
          </comment>
          <pub-id pub-id-type="doi">10.1159/000320864</pub-id>
          <pub-id pub-id-type="medline">20829636</pub-id>
          <pub-id pub-id-type="pii">000320864</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berthomier</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Drouot</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Herman-Stoïca</surname>
              <given-names>Maria</given-names>
            </name>
            <name name-style="western">
              <surname>Berthomier</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Prado</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bokar-Thire</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Benoit</surname>
              <given-names>Odile</given-names>
            </name>
            <name name-style="western">
              <surname>Mattout</surname>
              <given-names>Jérémie</given-names>
            </name>
            <name name-style="western">
              <surname>d'Ortho</surname>
              <given-names>Marie-Pia</given-names>
            </name>
          </person-group>
          <article-title>Automatic analysis of single-channel sleep EEG: validation in healthy individuals</article-title>
          <source>Sleep</source>
          <year>2007</year>
          <month>11</month>
          <volume>30</volume>
          <issue>11</issue>
          <fpage>1587</fpage>
          <lpage>1595</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/18041491"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/sleep/30.11.1587</pub-id>
          <pub-id pub-id-type="medline">18041491</pub-id>
          <pub-id pub-id-type="pmcid">PMC2082104</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Biswal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kulas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Goparaju</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Westover</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bianchi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>SLEEPNET: automated sleep staging system via deep learning</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online July 26, 2017</comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Hussaini</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Westover</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>SLEEPER: interpretable sleep staging via prototypes from expert rules</article-title>
          <year>2019</year>
          <conf-name>4th Machine Learning for Healthcare Conference</conf-name>
          <conf-date>2019</conf-date>
          <conf-loc>Ann Arbor, MI</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schroff</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kalenichenko</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Philbin</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>FaceNet: A unified embedding for face recognition and clustering</article-title>
          <year>2015</year>
          <conf-name>2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>
          <conf-date>June 07-12, 2015</conf-date>
          <conf-loc>Boston, MA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/cvpr.2015.7298682</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jing</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Self-supervised visual feature learning with deep neural networks: a survey</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2021</year>
          <month>11</month>
          <day>1</day>
          <volume>43</volume>
          <issue>11</issue>
          <fpage>4037</fpage>
          <lpage>4058</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2020.2992393</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Self-supervised video representation learning by pace prediction</article-title>
          <year>2020</year>
          <conf-name>16th European Conference on Computer Vision</conf-name>
          <conf-date>August 23-28, 2020</conf-date>
          <conf-loc>Glasgow</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-030-58520-4_30</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>CERT: contrastive self-supervised learning for language understanding</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 16, 2020</comment>
          <pub-id pub-id-type="doi">10.36227/techrxiv.12308378.v1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mikolov</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distributed representations of words and phrases and their compositionality</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online October 16, 2013</comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shukla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Petridis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pantic</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Does visual self-supervision improve learning of speech representations for emotion recognition?</article-title>
          <source>IEEE Trans Affective Comput</source>
          <year>2023</year>
          <month>1</month>
          <day>1</day>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>406</fpage>
          <lpage>420</lpage>
          <pub-id pub-id-type="doi">10.1109/taffc.2021.3062406</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Franceschi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dieuleveut</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jaggi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Unsupervised scalable representation learning for multivariate time series</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online January 30, 2019</comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oord</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Vinyals</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Representation learning with contrastive predictive coding</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online July 10, 2018</comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lemkhenter</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Favaro</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Boosting generalization in bio-signal classification by learning the phase-amplitude coupling</article-title>
          <year>2020</year>
          <conf-name>42nd DAGM GCPR: DAGM German Conference on Pattern Recognition</conf-name>
          <conf-date>September 28-October 1, 2020</conf-date>
          <conf-loc>Tübingen, Germany</conf-loc>
          <pub-id pub-id-type="doi">10.1007/978-3-030-71278-5_6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Banville</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chehab</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Hyvärinen</surname>
              <given-names>Aapo</given-names>
            </name>
            <name name-style="western">
              <surname>Engemann</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gramfort</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Uncovering the structure of clinical EEG signals with self-supervised learning</article-title>
          <source>J Neural Eng</source>
          <year>2021</year>
          <month>03</month>
          <day>31</day>
          <volume>18</volume>
          <issue>4</issue>
          <fpage>046020</fpage>
          <pub-id pub-id-type="doi">10.1088/1741-2552/abca18</pub-id>
          <pub-id pub-id-type="medline">33181507</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Azemi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dogrusoz</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Tuzel</surname>
              <given-names>CO</given-names>
            </name>
          </person-group>
          <article-title>Subject-aware contrastive learning for biosignals (US Patent US20210374570A1)</article-title>
          <source>Google Patents</source>
          <year>2021</year>
          <access-date>2023-06-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://patents.google.com/patent/US20210374570A1/en?q=(Subject-aware+contrastive+learning+biosignals)&amp;oq=Subject-aware+contrastive+learning+for+biosignals">https://patents.google.com/patent/US20210374570A1/en?q=(Subject-aware+contrastive+learning+biosignals)&amp;oq=Subject-aware+contrastive+learning+for+biosignals</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="web">
          <article-title>Open EEG Data Preprocessing and SSL Baselines</article-title>
          <source>GitHub</source>
          <year>2023</year>
          <access-date>2023-06-26</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/ycq091044/ContraWR">https://github.com/ycq091044/ContraWR</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
