<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v2i1e41818</article-id>
      <article-id pub-id-type="pmid">38875580</article-id>
      <article-id pub-id-type="doi">10.2196/41818</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Extractive Clinical Question-Answering With Multianswer and Multifocus Questions: Data Set Development and Evaluation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Malin</surname>
            <given-names>Bradley</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yin</surname>
            <given-names>Zhijun</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Danciu</surname>
            <given-names>Ioana</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Moon</surname>
            <given-names>Sungrim</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9191-3897</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>He</surname>
            <given-names>Huan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1312-4195</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Jia</surname>
            <given-names>Heling</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-5906-6577</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Hongfang</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2570-3741</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Fan</surname>
            <given-names>Jungwei Wilfred</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Artificial Intelligence &#38; Informatics</institution>
            <institution>Mayo Clinic</institution>
            <addr-line>200 1st Street SW</addr-line>
            <addr-line>RO_HA_07_741B-I</addr-line>
            <addr-line>Rochester, MN, 55905</addr-line>
            <country>United States</country>
            <phone>1 507 538 1191</phone>
            <email>Fan.Jung-wei@mayo.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6349-3752</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Artificial Intelligence &#38; Informatics</institution>
        <institution>Mayo Clinic</institution>
        <addr-line>Rochester, MN</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Jungwei Wilfred Fan <email>Fan.Jung-wei@mayo.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>20</day>
        <month>6</month>
        <year>2023</year>
      </pub-date>
      <volume>2</volume>
      <elocation-id>e41818</elocation-id>
      <history>
        <date date-type="received">
          <day>9</day>
          <month>8</month>
          <year>2022</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>11</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>31</day>
          <month>1</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>22</day>
          <month>5</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Sungrim Moon, Huan He, Heling Jia, Hongfang Liu, Jungwei Wilfred Fan. Originally published in JMIR AI (https://ai.jmir.org), 20.06.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2023/1/e41818" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Extractive question-answering (EQA) is a useful natural language processing (NLP) application for answering patient-specific questions by locating answers in their clinical notes. Realistic clinical EQA can yield multiple answers to a single question and multiple focus points in 1 question, which are lacking in existing data sets for the development of artificial intelligence solutions.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to create a data set for developing and evaluating clinical EQA systems that can handle natural multianswer and multifocus questions.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We leveraged the annotated relations from the 2018 National NLP Clinical Challenges corpus to generate an EQA data set. Specifically, the 1-to-N, M-to-1, and M-to-N drug-reason relations were included to form the multianswer and multifocus question-answering entries, which represent more complex and natural challenges in addition to the basic 1-drug-1-reason cases. A baseline solution was developed and tested on the data set.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The derived RxWhyQA data set contains 96,939 QA entries. Among the answerable questions, 25% of them require multiple answers, and 2% of them ask about multiple drugs within 1 question. Frequent cues were observed around the answers in the text, and 90% of the <italic>drug</italic> and <italic>reason</italic> terms occurred within the same or an adjacent sentence. The baseline EQA solution achieved a best <italic>F</italic><sub>1</sub>-score of 0.72 on the entire data set, and on specific subsets, it was 0.93 for the unanswerable questions, 0.48 for single-drug questions versus 0.60 for multidrug questions, and 0.54 for the single-answer questions versus 0.43 for multianswer questions.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The RxWhyQA data set can be used to train and evaluate systems that need to handle multianswer and multifocus questions. Specifically, multianswer EQA appears to be challenging and therefore warrants more investment in research. We created and shared a clinical EQA data set with multianswer and multifocus questions that would channel future research efforts toward more realistic scenarios.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>question-answering</kwd>
        <kwd>information extraction</kwd>
        <kwd>dataset</kwd>
        <kwd>data set</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>natural language processing</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>The thought process involved in clinical reasoning and decision-making can be naturally framed into a series of questions and answers [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Achieving human-like question-answering (QA) capability is highly regarded in artificial intelligence (AI). Medical QA research has garnered terrific momentum over the past decade, and a new generation of AI scientists is undergoing a state-of-the-art update at a daunting pace almost every month (if not every week). One of the very sought-after applications is to find the answer within a given document, or extractive QA (EQA), which enables patient-specific QA based on the information provided in the clinical text [<xref ref-type="bibr" rid="ref3">3</xref>]. As an essential component in most AI engineering undertakings, EQA training data determine not only the likelihood of success in terms of annotation quality but also the fidelity of representing the target scenario.</p>
        <p>Along with other issues observed in existing medical EQA corpora [<xref ref-type="bibr" rid="ref4">4</xref>], the mainstream annotation approach knowingly simplifies the task into a “one answer per document” scheme. Although the simplification makes development and evaluation easier for promoting initial growth of the field, it is unrealistic because EQA can naturally have multiple qualified answers (or answer components) within 1 document, and often all of them must be captured to sufficiently answer a question [<xref ref-type="bibr" rid="ref5">5</xref>]. Moreover, a question can naturally involve multiple focus points such as “Why A, B, and C…” rather than requiring the user to ask 1 question for each point. To address this gap, we created an EQA data set that involves realistic, multianswer and multifocus cases by converting the concept-relation annotations from an existing clinical natural language processing (NLP) challenge data set. Our generated RxWhyQA data set includes a total of 96,939 QA entries, where 25% of the answerable questions require the identification of multiple answers and 2% of them ask about multiple drugs within 1 question. We also developed a baseline solution for multianswer QA and tested it on the RxWhyQA.</p>
        <p>The novelty of this study is reframing the original relation identification task into an EQA task, which simplifies the conventional 2-step approach of named entity recognition and relation classification into 1-step information extraction guided by natural language questions. Our primary contribution is the RxWhyQA as a resource that offers realistic constructs to facilitate NLP research in this underexplored area. To our knowledge, there has not been any EQA data set that contains multianswer and multifocus questions based on clinical notes.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <p>QA is a versatile task that can subsume diverse NLP tasks when properly represented [<xref ref-type="bibr" rid="ref6">6</xref>]. More than a decade of research has focused on the EQA task in NLP [<xref ref-type="bibr" rid="ref7">7</xref>]. As the name implies, EQA can be viewed as question-guided information extraction from a given text. Unlike conventional approaches that require the identification of the involved entities as one task followed by determination of the target relation between the entities as the other task, EQA consolidates these steps into a smooth one-shot task where the user asks a natural language question for the system to understand the focus point, identify relevant cues in the text, and locate the answer that satisfies the relation of interest. Although EQA demands higher machine intelligence, it is efficient in terms of the data schema for modeling and the human-computer interaction for users.</p>
        <p>The Stanford Question Answering Dataset (SQuAD) [<xref ref-type="bibr" rid="ref8">8</xref>] established a widely adopted framework for EQA, and in the later version (version 2.0) [<xref ref-type="bibr" rid="ref9">9</xref>], the task also requires a system to refrain from answering when no suitable answer is present in the text. In the clinical domain, corpora have been developed for EQA based on electronic health records (EHRs). In the study by Raghavan et al [<xref ref-type="bibr" rid="ref10">10</xref>], medical students were presented with structured and unstructured EHR information about each patient to generate realistic questions for a hypothetical office encounter. Using the BioASQ data set based on biomedical literature, Yoon et al [<xref ref-type="bibr" rid="ref5">5</xref>] proposed a sequence tagging approach to handling multianswer EQA. In the consumer health domain, Zhu et al [<xref ref-type="bibr" rid="ref11">11</xref>] developed a Multiple Answer Spans Healthcare Question Answering (ie, MASH-QA) data set specifically involving multiple answers of nonconsecutive spans in the target text. As a non-English example, Ju et al [<xref ref-type="bibr" rid="ref12">12</xref>] developed a Conditional Multiple-span Chinese Question Answering data set from a web-based QA forum. Pampari et al [<xref ref-type="bibr" rid="ref13">13</xref>] developed the emrQA, a large clinical EQA corpus generated through template-based semantic extraction from the Informatics for Integrating Biology &#38; the Bedside NLP challenge data sets. We took a similar approach as the emrQA but additionally included multianswer and multifocus questions that better reflect natural clinical EQA scenarios.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Generating the QA Annotations From a Relation Identification Challenge</title>
        <p>Our source data were based on the annotations originally created for the National NLP Clinical Challenges (n2c2) corpus of 2018, which aimed to identify adverse drug events by extracting various drug-related concepts and classifying their relations in the clinical text [<xref ref-type="bibr" rid="ref14">14</xref>]. Their final gold standard included 83,869 concepts and 59,810 relations in 505 discharge summaries. In this study, we focused on generating QA pairs from the subset of drug and reason concepts (ie, mainly about the prescribing justification) and the relations between the concepts. Each relation consisted of 2 arguments: a drug concept and a reason concept, as in an example pair such as <italic>drug-reason</italic> (morphine-pain). Accordingly, a question around the drug concept could be derived, such as “Why was morphine prescribed to the patient?” and the reason concept “pain” would be designated as the answer. In the n2c2 corpus, each pair of drug and reason concepts had their text mentions annotated in the corresponding clinical document. The properties make for a good EQA data set where the system is expected to consider the actual contexts surrounding the drug and reason rather than performing a simple lookup. This is especially important for extracting off-label uses because a standard indication knowledge base would not cover those exceptions documented in real-world clinical text.</p>
        <p>From the n2c2 annotations on each clinical document, we leveraged several relation types between the drug and reason concepts: 1 drug 0 reason, 1 drug 1 reason, 1 drug N reasons, N drugs 1 reason, or M drugs N reasons. The most straightforward were the 1-drug-1-reason relations (eg, the morphine-pain relation mentioned above), each translated into a 1-to-1 QA entry. The 1-drug-0-reason relations apparently corresponded to the 1-to-0 (unanswerable) QA entries. We preserved the 1-drug-N-reasons relations directly as 1-to-N QAs that require locating multiple answers in the text. For the N-drugs-1-reason and M-drugs-N-reasons relations, we preserved the original multidrug challenge in questions such as, “Why were amlodipine, metoprolol, and isosorbide prescribed to the patient?” The M-drugs-N-reasons relations would also derive multianswer entries such as those derived from the 1-drug-N-reasons relations. In addition to the generated QA entries, we also supplemented paraphrastic questions [<xref ref-type="bibr" rid="ref15">15</xref>] that may enhance the generalizability of the trained systems.</p>
      </sec>
      <sec>
        <title>Quantitative and Qualitative Analysis of the Derived QA Annotations</title>
        <p>Along with descriptive statistics of the QA entries and the number of answers per question, we computed the frequencies of the specific drug and reason concept terms (after applying lexical normalization such as lowercase) among the QA entries. The frequencies were meant to offer an intuitive estimate of the abundance of train/test data available for each specific concept or concept pair. We then randomly sampled 100 QA entries for manual review: 50 from those with a single answer and 50 from those with multiple answers. The common patterns informative to QA inference were summarized, offering evidence on what the potential AI solutions could leverage. In addition, we measured the distance (by the number of sentences) between the question and answer concepts. For each specific drug-reason pair, we considered the shortest distance if there were multiple occurrences of either concept. The distance was deemed 0 if the pair occurred within the same sentence. Distance may serve as a surrogate for measuring the challenge to AI systems, where a longer distance implies a more challenging task. In addition, we sampled 100 random drug-reason pairs from each test run (experimental setup described below) to estimate the prevalence of off-label uses in the derived data set. The MEDication-Indication (MEDI) knowledge base (version 2) high-precision subset [<xref ref-type="bibr" rid="ref16">16</xref>] was first used to screen for on-label uses by exact string match (with normalizing to lowercase), and the remaining drug-reason pairs were reviewed by a domain expert (HJ) to determine off-label uses.</p>
      </sec>
      <sec>
        <title>Development of a Baseline Solution</title>
        <sec>
          <title>Data Preparation and Model Training</title>
          <p>The annotations conform to the SQuAD 2.0 JSON format and can be readily used to train Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref17">17</xref>] for EQA tasks. We randomly partitioned the data set into the train, develop (dev), and test sets by the 5:2:3 ratio, corresponding to 153, 50, and 100 clinical documents, respectively. Random partitioning was carried out 3 times, each executed as a separate run of the experiment for quantifying performance variability. The base language model was ClinicalBERT [<xref ref-type="bibr" rid="ref18">18</xref>], a domain-customized BERT trained on approximately 2 million clinical documents from the MIMIC-III (version 1.4) database. We fine-tuned ClinicalBERT first on a why-question subset of SQuAD 2.0, followed by fine-tuning on the train set. Training parameters used in the ClinicalBERT fine-tuning were batch_train_size=32, max_seq_length=128, doc_stride=64, learning_rate=3e-5, and epochs=5. The dev set was then used to learn the threshold for determining when the ClinicalBERT model should refrain from providing any answer.</p>
        </sec>
        <sec>
          <title>Incremental Masking to Generate Multiple Answers</title>
          <p>To force the fine-tuned ClinicalBERT model to continue seeking other suitable answers in each clinical document, we implemented the following process on the test set as a heuristic baseline:</p>
          <list list-type="order">
            <list-item>
              <p>Let the EQA model complete its usual single-answer extraction and record the string of the top answer. No further action is needed if the model refrains from answering.</p>
            </list-item>
            <list-item>
              <p>Perform a case-insensitive string search using the top answer (from step 1 above) throughout the clinical note from where it was extracted and replace every occurrence into a dummy underscore “______” string of identical length. This literally generates a new version of the text by masking the original top answer in each question.</p>
            </list-item>
            <list-item>
              <p>Run the same EQA model for another round on the entire masked test set again to determine whether the model could identify additional answers elsewhere or started to refrain from answering.</p>
            </list-item>
          </list>
          <p>The 3 abovementioned steps were repeated until the model did not generate any new answers on the entire test set. Together, model training and the heuristic multianswer generation process are summarized in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>A flowchart of our heuristic approach to constructing a single-answer extractive question-answering model generates multiple answers by incremental masking. The main steps go from left to right. The upper-right “Answer-masking” box illustrates an example of masking where the model’s answer “leg edema” is replaced with a dummy underscore to force the model to look for viable alternative answers elsewhere in the text. BERT: Bidirectional Encoder Representations from Transformers; dev: develop; n2c2: National NLP Clinical Challenges; NLP: natural language processing; SQuAD: Stanford Question Answering Dataset.</p>
            </caption>
            <graphic xlink:href="ai_v2i1e41818_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>Evaluation of the Baseline Solution</title>
        <p>After the first round of masking, we began to have more than 1 answer generated by the model for some of the questions. Accordingly, the evaluation program (specifically for the overlap mode) was adapted to accommodate such M-to-N answer comparisons in determining the token-wise proportional match. When anchoring on each gold-standard answer, we selected the model answer with the most overlapping tokens as the best answer in setting the weighted true positive (TP) and false negative (FN); the weighted false positive (FP) was set vice versa by anchoring on each model answer—see equations 1-4 for definitions. On top of these weighted matches between gold-standard and model answers in each question, we tallied them over each entire test set to compute the solution’s precision, recall, and <italic>F</italic><sub>1</sub>-score, followed by qualitative error analysis.</p>
        <disp-formula>
          <graphic xlink:href="ai_v2i1e41818_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="ai_v2i1e41818_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="ai_v2i1e41818_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <disp-formula>
          <graphic xlink:href="ai_v2i1e41818_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Descriptive Statistics of the Derived RxWhyQA Data Set</title>
        <p>We leveraged a total of 10,489 relations from the n2c2 adverse drug events NLP challenge and derived the data set, consisting of 96,939 QA entries. <xref ref-type="table" rid="table1">Table 1</xref> summarizes the 5 major drug-reason relation categories in the n2c2 corpus, the strategies that we implemented to convert them into QA entries, and their resulting frequencies. <xref ref-type="table" rid="table2">Table 2</xref> shows the distribution for the number of answers per question: 75% of the questions have a single answer, while 25% of them require multiple answers. Duplicate answer terms located at different positions of the clinical documents were retained. For example, the procedure “CT” might be mentioned at several places in the text and be recorded as the answer to “Why was the patient prescribed contrast?” We included each such identical term and their different offsets as multiple answers so that the EQA solutions may leverage such nuances. The final data set was formatted into a SQuAD-compatible JSON file and shared through the n2c2 community annotations repository [<xref ref-type="bibr" rid="ref19">19</xref>]. <xref rid="figure2" ref-type="fig">Figure 2</xref> illustrates a multianswer entry in the RxWhyQA data set.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Categories, examples, and conversion strategies for making the drug-reason relations into the extractive question-answering annotations.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="220"/>
            <col width="340"/>
            <col width="260"/>
            <col width="180"/>
            <thead>
              <tr valign="bottom">
                <td>Category in the n2c2<sup>a</sup> corpus</td>
                <td>Example</td>
                <td>Conversion strategy</td>
                <td>Entries, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1 <italic>Drug</italic>, no Reason</td>
                <td><italic>Mirtazapine</italic> 15 mg PO QHS<sup>b</sup> (only the drug is mentioned but no reason is documented)</td>
                <td>Make an unanswerable QA<sup>c</sup> entry</td>
                <td>46,278</td>
              </tr>
              <tr valign="top">
                <td>1 <italic>Drug</italic>, 1 Reason</td>
                <td>The patient received <italic>morphine</italic> for pain as needed</td>
                <td>Make a 1-to-1 QA entry</td>
                <td>28,224<sup>d</sup></td>
              </tr>
              <tr valign="top">
                <td>N <italic>Drugs</italic>, 1 Reason</td>
                <td>Hypertension: severely elevated blood pressure. Started <italic>amlodipine</italic>, <italic>metoprolol</italic>, and <italic>isosorbide</italic>.</td>
                <td>Break into N separate 1-to-1 relations and make each a 1-to-1 QA entry</td>
                <td>N/A<sup>e</sup></td>
              </tr>
              <tr valign="top">
                <td>1 <italic>Drug</italic>, N Reasons</td>
                <td><italic>Albuterol sulfate</italic> 90 mcg… Puff Inhalation Q4H<sup>f</sup> for sob or wheeze.</td>
                <td>List the N reasons under the answer block to form a 1-to-N QA entry</td>
                <td>22,437<sup>g</sup></td>
              </tr>
              <tr valign="top">
                <td>M <italic>Drugs</italic>, N Reasons</td>
                <td>Left frontoparietal stroke - maintained on <italic>ASA</italic><sup>h</sup> and <italic>plavix ….</italic> Hx of CVA<sup>i</sup><italic>:</italic> restarted <italic>ASA/Plavix</italic> per the GI<sup>j</sup> team's recommendation<italic>.</italic></td>
                <td>List the N reasons under the answer block to form an M-to-N QA entry</td>
                <td>N/A</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>n2c2: National NLP (natural language processing) Clinical Challenges.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>PO QHS: one pill to be taken orally at bedtime.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>QA: question-answering.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>28,224 entries in total for the 1-drug-1-reason and N-drugs-1-reason categories together in the corpus.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>N/A: not applicable.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>Q4H: every 4 hours.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>22,437 entries in total for the 1-drug-N-reasons and M-drugs-N-reasons categories together in the corpus.</p>
            </fn>
            <fn id="table1fn8">
              <p><sup>h</sup>ASA: acetylsalicylic acid (aspirin).</p>
            </fn>
            <fn id="table1fn9">
              <p><sup>i</sup>Hx of CVA: history of cerebrovascular accident.</p>
            </fn>
            <fn id="table1fn10">
              <p><sup>j</sup>GI: gastrointestinal.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Unique answers among answerable questions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Frequency</td>
                <td>Unique answers, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>28,224 (75)</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>6804 (18)</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>1530 (4)</td>
              </tr>
              <tr valign="top">
                <td>≥4</td>
                <td>954 (3)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>A multianswer entry in the generated RxWhyQA data set. The “id” field is the unique ID for the question-answering entry in the data set. The “_mname” field indicates the medication name; that is, the anchor concept in the question. The “answer_start” is the character offset where the answer term occurs in the clinical document, which is hosted in the “context” field (not shown here). When “is_impossible” is false, the question-answering entry is answerable.</p>
          </caption>
          <graphic xlink:href="ai_v2i1e41818_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Content Analysis of the RxWhyQA Data Set</title>
        <p>The 5 most frequently asked drug terms in the answerable questions (with the number of QA entries noted in parentheses) were the following: coumadin (1278), vancomycin (1170), lasix (963), acetaminophen (801), and antibiotics (783). Without any overlap, the 5 most frequent drug terms in the unanswerable questions were the following: docusate sodium (648), metoprolol tartrate (504), aspirin (468), pantoprazole (450), and penicillins (414). Among the answerable QA entries, the 5 most frequently seen pairs were the following: acetaminophen-pain (504), senna-constipation (369), oxycodone-pain (261), coumadin-afib (252), and acetaminophen-fever (234). As a potential surrogate measure of task difficulty, <xref ref-type="table" rid="table3">Table 3</xref> shows the distribution for the number of sentences between the question anchor and answer term in each answerable QA entry. The majority (n=32,409, 72%) of the drug and reason terms occur within the same sentence, and the portion increases to 90% (72%+18%) when adding those with the drug and reason occurring in an adjacent sentence (ie, distance=1). In the extreme case, the drug and reason terms are 16 sentences apart from each other. <xref ref-type="table" rid="table4">Table 4</xref> summarizes the commonly observed contexts from manually reviewing 100 random samples of the answerable QA entries. There were 7, 10, and 3 off-label uses, respectively, in each of the 3 sets of 100 random drug-reason pairs reviewed by the domain expert, making the estimate of off-label uses average at 6.7% in the RxWhyQA data set. The detailed off-label review results are available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Distribution for the distance between question and answer terms (0=the question and answer terms occur in the same sentence).</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Distance (by sentence) between the question and answer items</td>
                <td>QA<sup>a</sup> entries, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>0</td>
                <td>32,409</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>8154</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>2646</td>
              </tr>
              <tr valign="top">
                <td>3</td>
                <td>1188</td>
              </tr>
              <tr valign="top">
                <td>4</td>
                <td>405</td>
              </tr>
              <tr valign="top">
                <td>5</td>
                <td>153</td>
              </tr>
              <tr valign="top">
                <td>6</td>
                <td>81</td>
              </tr>
              <tr valign="top">
                <td>7</td>
                <td>72</td>
              </tr>
              <tr valign="top">
                <td>8</td>
                <td>27</td>
              </tr>
              <tr valign="top">
                <td>9</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>10</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>11</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>12</td>
                <td>9</td>
              </tr>
              <tr valign="top">
                <td>13</td>
                <td>9</td>
              </tr>
              <tr valign="top">
                <td>14</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>15</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>16</td>
                <td>9</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>QA: question-answering.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Common patterns (observed &#62;10 times) between the question and the answer terms in 100 random question-answering entries. Each reason or drug represents where a question or answer anchor term occurs in the pattern. The shorthands are used as follows: ellipsis stands for 0 to multiple words, parentheses denote scoping, square brackets with pipes indicate a boolean OR set, and a question mark denotes a binary quantifier for presence or absence.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="850"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Pattern</td>
                <td>Frequency</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Reason … (being)? [received&#124;started&#124;restarted&#124;required&#124;maintained&#124;continued?] (on)? <italic>Drug</italic></td>
                <td>25</td>
              </tr>
              <tr valign="top">
                <td><italic>Drug</italic> … [prn&#124;PRN&#124;(as needed for)?] Reason</td>
                <td>18</td>
              </tr>
              <tr valign="top">
                <td><italic>Drug</italic> … (was)? [attempted&#124;given&#124;dosing&#124;taking] for (any)? [possible&#124;likely&#124;presumed]? Reason</td>
                <td>14</td>
              </tr>
              <tr valign="top">
                <td>Reason … (was)? [managed&#124;treated&#124;improved&#124;recommended&#124;downtrended&#124;resolved&#124;reversed&#124;needed] with <italic>Drug</italic></td>
                <td>13</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>F1-Score of the Baseline EQA Solution</title>
        <p>The performance in determining the <italic>F</italic><sub>1</sub>-score across 3 experimental runs is summarized in <xref rid="figure3" ref-type="fig">Figure 3</xref>, where the subfigures represent different slices. Specifically, the underlying set relations are the following: the full set (<xref rid="figure3" ref-type="fig">Figure 3</xref>A) minus the unanswerable questions (<xref rid="figure3" ref-type="fig">Figure 3</xref>B) yields the answerable questions, which can be represented by either single-answer questions (<xref rid="figure3" ref-type="fig">Figure 3</xref>C) plus multianswer questions (<xref rid="figure3" ref-type="fig">Figure 3</xref>D) if sliced per the number of answers or by questions asking about a single drug (<xref rid="figure3" ref-type="fig">Figure 3</xref>E) plus questions asking about multiple drugs (<xref rid="figure3" ref-type="fig">Figure 3</xref>F) if sliced per the number of drugs asked in the question. Each bar represents the average <italic>F</italic><sub>1</sub>-score across the runs and with the range marked for each incremental masking step. As seen in <xref rid="figure3" ref-type="fig">Figure 3</xref>A, the overall <italic>F</italic><sub>1</sub>-score increased immediately after applying the first round of answer masking (from “original” to “mask 1”, <italic>P</italic>&#60;.05), which then stayed constant throughout the remaining mask iterations. The increase in the <italic>F</italic><sub>1</sub>-score in <xref rid="figure3" ref-type="fig">Figure 3</xref>A corresponds to the exact pattern in <xref rid="figure3" ref-type="fig">Figure 3</xref>D, suggesting that the performance gain was mainly from the multianswer questions; that is, the target originally intended by the masking. Multianswer questions appear to be more challenging than single-answer questions on comparing <xref rid="figure3" ref-type="fig">Figures 3</xref>C and 3D. 
According to <xref rid="figure3" ref-type="fig">Figures 3</xref>E and 3F, asking about multiple drugs at once made it easier for the model to find the right answer, albeit with wide performance variation. The BERT model was good at refraining from answering unanswerable questions, as indicated by the high <italic>F</italic><sub>1</sub>-scores in <xref rid="figure3" ref-type="fig">Figure 3</xref>B. The detailed results of the 3 experimental runs are available in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. There were 189 QA entries associated with the off-label uses identified by manually reviewing 300 random drug-reason pairs from the 3 test runs, all of which happened to be single-answer cases. We computed for this small set a single aggregate <italic>F</italic><sub>1</sub>-score, which was 0.49 and appeared consistently lower than the range shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>C.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p><italic>F</italic><sub>1</sub>-scores of the fine-tuned Bidirectional Encoder Representations from Transformers extractive question-answering model across the incremental masking rounds. Each bar represents the average <italic>F</italic><sub>1</sub>-score based on 3 experimental runs, with the minimum and maximum range marked (light blue). (A) The full set, (B) unanswerable questions, (C) questions with exactly 1 answer, (D) questions with multiple answers, (E) questions asking about a single drug, and (F) questions asking about multiple drugs.</p>
          </caption>
          <graphic xlink:href="ai_v2i1e41818_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Significance and Contributions</title>
        <p>Although why-QA only covers a subdomain of clinical QA, it represents a unique category that deals with the cause, motivation, circumstance, and justification. It was estimated that 20% of the top 10 question types asked by family physicians [<xref ref-type="bibr" rid="ref20">20</xref>] could be rephrased into a why-question. Clinical why-QA is important because (1) the ultimate task resembles expert-level explanatory synthesis of knowledge and evidence and (2) it aligns with identifying reasons for the decisions documented in clinical text. Therefore, the contents and challenges offered by the RxWhyQA data set itself have independent, practical value for developing clinical QA applications. Although drug-reason QA appears to be a niche topic, a working solution developed on the data set can broadly benefit research around adherence to clinical guidelines, care quality assessment, and health disparity from prescribing variations.</p>
        <p>The generated RxWhyQA data set can serve as the training and testing of AI systems that target excerpting pertinent information in a clinical document to answer patient-specific questions. In addition to the unanswerable questions that require a system to refrain from extracting FP answers, the RxWhyQA data set features 9288 questions that require the system to identify multiple answers, which is a realistic challenge in clinical QA. The data set also contains 611 questions that ask about the reason for prescribing multiple drugs at once. The multianswer and multifocus questions represent a key improvement beyond existing clinical EQA data sets, of which the rigid constructs would preclude AI solutions from learning to deal with more realistic use scenarios. Additionally, our experiments on these special constructs validated the challenging nature of multianswer questions and revealed that multifocus questions may turn out to be easier due to the availability of richer information for use by the model. Our drug-reason–focused data set may offer a coherent theme that enables better controlled experiments to compare how the different QA constructs (eg, single- vs multianswer questions) affect AI system performance.</p>
      </sec>
      <sec>
        <title>Properties Found About the RxWhyQA Data Set</title>
        <p>The frequent drugs and drug-reason pairs likely imply the clinical practice in the original n2c2 cohort. The finding that the top 5 drugs in the unanswerable questions (ie, no answer provided in the gold-standard annotation) were different from those in the answerable questions suggests that the prescription of certain drugs might be self-evident without needing a documented reason. Our question-answer–mentioning distance analysis showed that 90% of the drug-reason pairs were within the same or an adjacent sentence in the RxWhyQA data set, indicating modest demand for long-distance inference by AI solutions. We were able to identify frequent contextual patterns such as “PRN” (ie, pro re nata) or “as needed for” (<xref ref-type="table" rid="table4">Table 4</xref>) that AI models may learn to facilitate locating the answers. It is estimated that the data set contains 6.7% of off-label drug uses as the target answers, which would be useful for training systems to identify such cases and facilitate research on understanding the medical practice variation or innovation.</p>
      </sec>
      <sec>
        <title>Behavior of the Baseline EQA Solution</title>
        <p>The notable increase in the <italic>F</italic><sub>1</sub>-score (<xref rid="figure3" ref-type="fig">Figure 3</xref>D) after applying 1 round of masking suggests that the masking effectively forced the BERT model to look elsewhere, which resulted in an increase in the <italic>F</italic><sub>1</sub>-score by retrieving the majority of the additional answers (see <xref ref-type="table" rid="table2">Table 2</xref>). Interestingly, we noticed in many cases that the model clung on to the masked span (ie, capturing the “________” as an answer) where some of such strong contextual patterns were present. This phenomenon supports that transformer-based EQA models do leverage contextual information rather than merely memorizing the surface question-answer pairs. Moreover, our post hoc inspection noted that correct (synonymous) answers were found by the model that were not in the gold-standard annotation (eg, “allergic reaction” versus “anaphylaxis” to a question about “epipen”), suggesting that the performance could be underestimated. As a caveat, we were aware that our baseline solution was essentially a convenient hack that made a model trained for single-answer EQA find multiple answers through a stepwise probing procedure. As more advanced approaches constantly emerge [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], we welcome the research community to evaluate them by using the RxWhyQA data set. For example, the lower <italic>F</italic><sub>1</sub>-score on those off-label uses indicates that they might represent challenging cases and demand more robust AI solutions.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>We admit several limitations in this study: (1) the source n2c2 corpus represented a specific cohort that may not generalize to every clinical data set, (2) we did not exhaustively diversify the paraphrastic questions but left it for future exploration on other promising approaches [<xref ref-type="bibr" rid="ref23">23</xref>], (3) we did not intend to extensively compare state-of-the-art solutions for multianswer QA but rather intended to offer a convenience baseline along with releasing the RxWhyQA corpus, and (4) the drug-reason relations represent a narrow topic for EQA development and evaluation. However, we believe that the definite theme would preferably make it a less confounded test set for assessing the effect of multianswer and multifocus questions on AI systems.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We derived and shared the RxWhyQA, an EQA data set for training and testing systems to answer patient-specific questions based on clinical documents. The RxWhyQA data set includes 9288 multianswer questions and 611 multifocus questions, each representing a critical scenario not well covered by existing data sets. Upon evaluating a baseline solution, the multianswer questions appeared to be more challenging than single-answer questions. Although the RxWhyQA focuses on why-questions derived from drug-reason relations, it offers a rich data set involving realistic constructs and exemplifies an innovation in recasting NLP annotations of different tasks for EQA research.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Manual annotation of off-label uses in 300 randomly sampled drug-reason QA pairs from the test sets.</p>
        <media xlink:href="ai_v2i1e41818_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 40 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Detailed <italic>F</italic><sub>1</sub>-scores of the BERT model across the 3 test runs, on the different subsets, with the incremental answer masking applied.</p>
        <media xlink:href="ai_v2i1e41818_app2.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 13 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">dev</term>
          <def>
            <p>develop</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">EQA</term>
          <def>
            <p>extractive question-answering</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">FN</term>
          <def>
            <p>false negative</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">FP</term>
          <def>
            <p>false positive</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">n2c2</term>
          <def>
            <p>National NLP Clinical Challenges</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">QA</term>
          <def>
            <p>question-answering</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">SQuAD</term>
          <def>
            <p>Stanford Question Answering Dataset</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">TP</term>
          <def>
            <p>true positive</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We thank the n2c2 organizers for making the annotations available to the research community. The study was partly supported by the Mayo Clinic Kern Center for the Science of Health Care Delivery. The research was supported by the National Center for Advancing Translational Sciences (U01TR002062).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>JWF conceived the study. HL offered scientific consultation. SM implemented the data conversion and analysis. HH assisted in the data conversion and graphic presentation. HJ reviewed and determined the off-label drug uses. SM and JWF drafted the manuscript. All authors contributed to the interpretation of the results and critical revision of the manuscript, and approved the final submission.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cimino</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Putting the "why" in "EHR": capturing and coding clinical cognition</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2019</year>
          <month>11</month>
          <day>01</day>
          <volume>26</volume>
          <issue>11</issue>
          <fpage>1379</fpage>
          <lpage>1384</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31407781"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz125</pub-id>
          <pub-id pub-id-type="medline">31407781</pub-id>
          <pub-id pub-id-type="pii">5549268</pub-id>
          <pub-id pub-id-type="pmcid">PMC6798564</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodwin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Harabagiu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Medical question answering for clinical decision support</article-title>
          <year>2016</year>
          <conf-name>CIKM'16: ACM Conference on Information and Knowledge Management</conf-name>
          <conf-date>October 24-28, 2016</conf-date>
          <conf-loc>Indianapolis, IN</conf-loc>
          <pub-id pub-id-type="doi">10.1145/2983323.2983819</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Biomedical question answering: a survey of approaches and challenges</article-title>
          <source>ACM Comput Surv</source>
          <year>2022</year>
          <month>01</month>
          <day>18</day>
          <volume>55</volume>
          <issue>2</issue>
          <fpage>1</fpage>
          <lpage>36</lpage>
          <pub-id pub-id-type="doi">10.1145/3490238</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yue</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Clinical reading comprehension: a thorough analysis of the emrQA dataset</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 1, 2020</comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.410</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lagerberg</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Sequence tagging for biomedical extractive question answering</article-title>
          <source>Bioinformatics</source>
          <year>2022</year>
          <month>08</month>
          <day>02</day>
          <volume>38</volume>
          <issue>15</issue>
          <fpage>3794</fpage>
          <lpage>3801</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35713500"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btac397</pub-id>
          <pub-id pub-id-type="medline">35713500</pub-id>
          <pub-id pub-id-type="pii">6609766</pub-id>
          <pub-id pub-id-type="pmcid">PMC9344839</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McCann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Keskar</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Socher</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The natural language decathlon: multitask learning as question answering</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online June 20, 2018</comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A survey of extractive question answering</article-title>
          <year>2022</year>
          <conf-name>2022 International Conference on High Performance Big Data and Intelligent Systems (HDIS)</conf-name>
          <conf-date>December 10-11, 2022</conf-date>
          <conf-loc>Tianjin, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/hdis56859.2022.9991478</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lopyrev</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>SQuAD: 100,000+ questions for machine comprehension of text</article-title>
          <source>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2016</year>
          <conf-name>2016 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>2016</conf-date>
          <conf-loc>Austin, TX</conf-loc>
          <fpage>2383</fpage>
          <lpage>2392</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/d16-1264</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Know what you don’t know: unanswerable questions for SQuAD</article-title>
          <source>Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</source>
          <year>2018</year>
          <conf-name>56th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>2018</conf-date>
          <conf-loc>Melbourne, Australia</conf-loc>
          <fpage>784</fpage>
          <lpage>789</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/p18-2124</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raghavan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Patwardhan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Devarakonda</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Annotating electronic medical records for question answering</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 17, 2018</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1805.06816</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ahuja</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Juan</surname>
              <given-names>D-C</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>CK</given-names>
            </name>
          </person-group>
          <article-title>Question answering with long multiple-span answers</article-title>
          <year>2020</year>
          <conf-name>The 2020 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>November 16-20, 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2020.findings-emnlp.342</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ju</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>CMQA: a dataset of conditional question answering with multiple-span answers</article-title>
          <year>2022</year>
          <conf-name>29th International Conference on Computational Linguistics</conf-name>
          <conf-date>2022</conf-date>
          <conf-loc>Gyeongju, Republic of Korea</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pampari</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Raghavan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>emrQA: A large corpus for question answering on electronic medical records</article-title>
          <year>2018</year>
          <conf-name>The 2018 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>October 31-November 4, 2018</conf-date>
          <conf-loc>Brussels, Belgium</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/d18-1258</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Buchan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Filannino</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stubbs</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Uzuner</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>2018 n2c2 shared task on adverse drug events and medication extraction in electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>01</month>
          <day>01</day>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>12</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31584655">https://europepmc.org/abstract/MED/31584655</ext-link>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz166</pub-id>
          <pub-id pub-id-type="medline">31584655</pub-id>
          <pub-id pub-id-type="pii">5581277</pub-id>
          <pub-id pub-id-type="pmcid">PMC7489085</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moon</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>How you ask matters: the effect of paraphrastic questions to BERT performance on a clinical SQuAD dataset</article-title>
          <year>2020</year>
          <conf-name>The 3rd Clinical Natural Language Processing Workshop</conf-name>
          <conf-date>November 19, 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2020.clinicalnlp-1.13</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Kerchberger</surname>
              <given-names>VE</given-names>
            </name>
            <name name-style="western">
              <surname>Borza</surname>
              <given-names>VA</given-names>
            </name>
            <name name-style="western">
              <surname>Eken</surname>
              <given-names>HN</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>An updated, computable MEDication-Indication resource for biomedical research</article-title>
          <source>Sci Rep</source>
          <year>2021</year>
          <month>09</month>
          <day>23</day>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>18953</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-021-98579-4">https://doi.org/10.1038/s41598-021-98579-4</ext-link>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-021-98579-4</pub-id>
          <pub-id pub-id-type="medline">34556781</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-021-98579-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC8460636</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online October 11, 2018</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boag</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>W-H</given-names>
            </name>
            <name name-style="western">
              <surname>Jindi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McDermott</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Publicly available clinical BERT embeddings</article-title>
          <year>2019</year>
          <conf-name>The 2nd Clinical Natural Language Processing Workshop</conf-name>
          <conf-date>June 7, 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>n2c2 Data Upload: Community generated annotations</article-title>
          <source>DBMI Data Portal</source>
          <access-date>2023-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://portal.dbmi.hms.harvard.edu/projects/n2c2-du/">https://portal.dbmi.hms.harvard.edu/projects/n2c2-du/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ely</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Osheroff</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Ebell</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Bergus</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Chambliss</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Evans</surname>
              <given-names>ER</given-names>
            </name>
          </person-group>
          <article-title>Analysis of questions asked by family doctors regarding patient care</article-title>
          <source>BMJ</source>
          <year>1999</year>
          <month>08</month>
          <day>07</day>
          <volume>319</volume>
          <issue>7206</issue>
          <fpage>358</fpage>
          <lpage>361</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/10435959">https://europepmc.org/abstract/MED/10435959</ext-link>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.319.7206.358</pub-id>
          <pub-id pub-id-type="medline">10435959</pub-id>
          <pub-id pub-id-type="pmcid">PMC28191</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>A multi-type multi-span network for reading comprehension that requires discrete reasoning</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online August 15, 2019</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1908.05514</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Segal</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Efrat</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shoham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Globerson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Berant</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A simple and effective model for answering multi-span questions</article-title>
          <year>2020</year>
          <conf-name>The 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>
          <conf-date>November 16-20, 2020</conf-date>
          <conf-loc>Online</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.248</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soni</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>A paraphrase generation system for EHR question answering</article-title>
          <year>2019</year>
          <conf-name>The 18th BioNLP Workshop and Shared Task</conf-name>
          <conf-date>August 1, 2019</conf-date>
          <conf-loc>Florence, Italy</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/w19-5003</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
