<?xml version='1.0'?>
<!DOCTYPE art SYSTEM 'http://www.biomedcentral.com/xml/article.dtd'>
<art>
<ui>gb-2012-13-12-r115</ui>
<ji>1465-6906</ji>
<fm>
<dochead>Research</dochead>
<bibl>
<title><p>Whole-genome reconstruction and mutational signatures in gastric cancer</p></title>
<aug>
<au ca="yes" ce="yes" id="A1"><snm>Nagarajan</snm><fnm>Niranjan</fnm><insr iid="I1"/><email>nagarajann@gis.a-star.edu.sg</email></au>
<au ce="yes" id="A2"><snm>Bertrand</snm><fnm>Denis</fnm><insr iid="I1"/><email>bertrandd@gis.a-star.edu.sg</email></au>
<au ce="yes" id="A3"><snm>Hillmer</snm><mi>M</mi><fnm>Axel</fnm><insr iid="I2"/><email>hillmer@gis.a-star.edu.sg</email></au>
<au ce="yes" id="A4"><snm>Zang</snm><mnm>Jiang</mnm><fnm>Zhi</fnm><insr iid="I3"/><insr iid="I4"/><email>zangncc@gmail.com</email></au>
<au id="A5"><snm>Yao</snm><fnm>Fei</fnm><insr iid="I2"/><insr iid="I5"/><email>yaof@gis.a-star.edu.sg</email></au>
<au id="A6"><snm>Jacques</snm><fnm>Pierre-&#201;tienne</fnm><insr iid="I1"/><email>jacquespe@gis.a-star.edu.sg</email></au>
<au id="A7"><snm>Teo</snm><mi>SM</mi><fnm>Audrey</fnm><insr iid="I2"/><email>teoa1@gis.a-star.edu.sg</email></au>
<au id="A8"><snm>Cutcutache</snm><fnm>Ioana</fnm><insr iid="I6"/><email>ioana.cutcutache@duke-nus.edu.sg</email></au>
<au id="A9"><snm>Zhang</snm><fnm>Zhenshui</fnm><insr iid="I2"/><email>zhangzs@gis.a-star.edu.sg</email></au>
<au id="A10"><snm>Lee</snm><mnm>Heng</mnm><fnm>Wah</fnm><insr iid="I1"/><email>leewhc@gis.a-star.edu.sg</email></au>
<au id="A11"><snm>Sia</snm><mnm>Yen</mnm><fnm>Yee</fnm><insr iid="I2"/><email>siayy@gis.a-star.edu.sg</email></au>
<au id="A12"><snm>Gao</snm><fnm>Song</fnm><insr iid="I7"/><email>gaosong0329@gmail.com</email></au>
<au id="A13"><snm>Ariyaratne</snm><mi>N</mi><fnm>Pramila</fnm><insr iid="I1"/><email>ariyaratnep@gis.a-star.edu.sg</email></au>
<au id="A14"><snm>Ho</snm><fnm>Andrea</fnm><insr iid="I2"/><email>hoa5@gis.a-star.edu.sg</email></au>
<au id="A15"><snm>Woo</snm><mnm>Yi</mnm><fnm>Xing</fnm><insr iid="I1"/><email>wooxy@gis.a-star.edu.sg</email></au>
<au id="A16"><snm>Veeravalli</snm><fnm>Lavanya</fnm><insr iid="I8"/><email>veeravallil@gis.a-star.edu.sg</email></au>
<au id="A17"><snm>Ong</snm><mnm>Kiat</mnm><fnm>Choon</fnm><insr iid="I9"/><email>abelong@gmail.com</email></au>
<au id="A18"><snm>Deng</snm><fnm>Niantao</fnm><insr iid="I10"/><email>niantaodeng@gmail.com</email></au>
<au id="A19"><snm>Desai</snm><mi>V</mi><fnm>Kartiki</fnm><insr iid="I11"/><email>desaikv@gis.a-star.edu.sg</email></au>
<au id="A20"><snm>Khor</snm><mnm>Chuen</mnm><fnm>Chiea</fnm><insr iid="I4"/><insr iid="I12"/><email>khorcc@gis.a-star.edu.sg</email></au>
<au id="A21"><snm>Hibberd</snm><mi>L</mi><fnm>Martin</fnm><insr iid="I4"/><insr iid="I12"/><email>hibberdml@gis.a-star.edu.sg</email></au>
<au id="A22"><snm>Shahab</snm><fnm>Atif</fnm><insr iid="I8"/><email>shahabatif@gmail.com</email></au>
<au id="A23"><snm>Rao</snm><fnm>Jaideepraj</fnm><insr iid="I13"/><email>Jaideepraj_Rao@ttsh.com.sg</email></au>
<au id="A24"><snm>Wu</snm><fnm>Mengchu</fnm><insr iid="I14"/><email>csiwm@nus.edu.sg</email></au>
<au id="A25"><snm>Teh</snm><fnm>Ming</fnm><insr iid="I15"/><email>pathead@nus.edu.sg</email></au>
<au id="A26"><snm>Zhu</snm><fnm>Feng</fnm><insr iid="I16"/><email>mdczhuf@nus.edu.sg</email></au>
<au id="A27"><snm>Chin</snm><mnm>Yung</mnm><fnm>Sze</fnm><insr iid="I15"/><email>sze_yung_chin@nuhs.edu.sg</email></au>
<au id="A28"><snm>Pang</snm><fnm>Brendan</fnm><insr iid="I14"/><insr iid="I15"/><email>patv13@nus.edu.sg</email></au>
<au id="A29"><snm>So</snm><mi>BY</mi><fnm>Jimmy</fnm><insr iid="I17"/><email>jimmy_so@nuhs.edu.sg</email></au>
<au id="A30"><snm>Bourque</snm><fnm>Guillaume</fnm><insr iid="I18"/><insr iid="I19"/><email>guil.bourque@mcgill.ca</email></au>
<au id="A31"><snm>Soong</snm><fnm>Richie</fnm><insr iid="I14"/><insr iid="I15"/><email>csirs@nus.edu.sg</email></au>
<au id="A32"><snm>Sung</snm><fnm>Wing-Kin</fnm><insr iid="I1"/><email>wksung@gis.a-star.edu.sg</email></au>
<au id="A33"><snm>Teh</snm><mnm>Tean</mnm><fnm>Bin</fnm><insr iid="I9"/><email>teh.bin.tean@singhealth.com.sg</email></au>
<au id="A34"><snm>Rozen</snm><fnm>Steven</fnm><insr iid="I6"/><email>steve.rozen@duke-nus.edu.sg</email></au>
<au id="A35"><snm>Ruan</snm><fnm>Xiaoan</fnm><insr iid="I2"/><email>ruanx@gis.a-star.edu.sg</email></au>
<au id="A36"><snm>Yeoh</snm><mnm>Guan</mnm><fnm>Khay</fnm><insr iid="I16"/><email>khay_guan_yeoh@nuhs.edu.sg</email></au>
<au ca="yes" id="A37"><snm>Tan</snm><mi>BO</mi><fnm>Patrick</fnm><insr iid="I10"/><insr iid="I12"/><insr iid="I14"/><email>tanbop@gis.a-star.edu.sg</email></au>
<au ca="yes" id="A38"><snm>Ruan</snm><fnm>Yijun</fnm><insr iid="I2"/><insr iid="I20"/><email>ruanyj@gis.a-star.edu.sg</email></au>
</aug>
<insg>
<ins id="I1"><p>Computational and Systems Biology, Genome Institute of Singapore, Singapore 138672, Singapore</p></ins>
<ins id="I2"><p>Genome Technology and Biology, Genome Institute of Singapore, Singapore 138672, Singapore</p></ins>
<ins id="I3"><p>Cellular and Molecular Research, National Cancer Centre, Singapore 169610, Singapore</p></ins>
<ins id="I4"><p>Cancer and Stem Cell Biology Program, Duke-National University of Singapore (NUS) Graduate Medical School, Singapore 169857, Singapore</p></ins>
<ins id="I5"><p>Department of Epidemiology and Public Health, Yong Loo Lin School of Medicine, National University of Singapore, Singapore 119074, Singapore</p></ins>
<ins id="I6"><p>Neuroscience and Behavioral Disorders, Duke-NUS Graduate Medical School, Singapore 169857, Singapore</p></ins>
<ins id="I7"><p>NUS Graduate School of Integrative Sciences and Engineering, Centre for Life Sciences, Singapore 117456, Singapore</p></ins>
<ins id="I8"><p>Research Computing, Genome Institute of Singapore, Singapore 138672, Singapore</p></ins>
<ins id="I9"><p>NCCS-VARI Translational Research Laboratory, National Cancer Centre, Singapore 169610, Singapore</p></ins>
<ins id="I10"><p>Genomic Oncology, Duke-NUS Graduate Medical School, Singapore 169857, Singapore</p></ins>
<ins id="I11"><p>National Institute of Biomedical Genomics, 2nd Floor Netaji Subash Sanatorium, Kalyani 741251 West Bengal, India</p></ins>
<ins id="I12"><p>Infectious Diseases, Genome Institute of Singapore, Singapore 138672, Singapore</p></ins>
<ins id="I13"><p>Department of Surgery, Tan Tock Seng Hospital, Singapore 308433, Singapore</p></ins>
<ins id="I14"><p>Cancer Science Institute of Singapore, Yong Loo Lin School of Medicine, National University of Singapore, Singapore 119074, Singapore</p></ins>
<ins id="I15"><p>Department of Pathology, National University Health System, National University of Singapore, Singapore 119074, Singapore</p></ins>
<ins id="I16"><p>Department of Medicine, National University Health System, National University of Singapore, Singapore 119074, Singapore</p></ins>
<ins id="I17"><p>Department of Surgery, National University Health System, National University of Singapore, Singapore 119074, Singapore</p></ins>
<ins id="I18"><p>Department of Human Genetics, McGill University, Montr&#233;al H3A 1B1, Canada</p></ins>
<ins id="I19"><p>McGill University and Genome Quebec Innovation Center, Montr&#233;al H3A 1A4, Canada</p></ins>
<ins id="I20"><p>Department of Biochemistry, National University of Singapore, Singapore 119074, Singapore</p></ins>
</insg>
<source>Genome Biology</source>
<issn>1465-6906</issn>
<pubdate>2012</pubdate>
<volume>13</volume>
<issue>12</issue>
<fpage>R115</fpage>
<url>http://genomebiology.com/2012/13/12/R115</url>
<xrefbib><pubidlist><pubid idtype="pmpid">23237666</pubid><pubid idtype="doi">10.1186/gb-2012-13-12-r115</pubid></pubidlist></xrefbib></bibl>
<history><rec><date><day>27</day><month>8</month><year>2012</year></date></rec><revrec><date><day>6</day><month>12</month><year>2012</year></date></revrec><acc><date><day>13</day><month>12</month><year>2012</year></date></acc><pub><date><day>13</day><month>12</month><year>2012</year></date></pub></history>
<cpyrt><year>2012</year><collab>Nagarajan et al.; licensee BioMed Central Ltd.</collab><note>This is an open access article distributed under the terms of the Creative Commons Attribution License (<url>http://creativecommons.org/licenses/by/2.0</url>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</note></cpyrt>
<abs>
<sec><st><p>Abstract</p></st>
<sec><st><p>Background</p></st>
<p>Gastric cancer is the second highest cause of global cancer mortality. To explore the complete repertoire of somatic alterations in gastric cancer, we combined massively parallel short read and DNA paired-end tag sequencing to present the first whole-genome analysis of two gastric adenocarcinomas, one with chromosomal instability and the other with microsatellite instability.</p>
</sec>
<sec><st><p>Results</p></st>
<p>Integrative analysis and <it>de novo </it>assemblies revealed the architecture of a wild-type <it>KRAS </it>amplification, a common driver event in gastric cancer. We discovered three distinct mutational signatures in gastric cancer - against a genome-wide backdrop of oxidative and microsatellite instability-related mutational signatures, we identified the first exome-specific mutational signature. Further characterization of the impact of these signatures by combining sequencing data from 40 complete gastric cancer exomes and targeted screening of an additional 94 independent gastric tumors uncovered <it>ACVR2A</it>, <it>RPL22 </it>and <it>LMAN1 </it>as recurrently mutated genes in microsatellite instability-positive gastric cancer and <it>PAPPA </it>as a recurrently mutated gene in <it>TP53 </it>wild-type gastric cancer.</p>
</sec>
<sec><st><p>Conclusions</p></st>
<p>These results highlight how whole-genome cancer sequencing can uncover information relevant to tissue-specific carcinogenesis that would otherwise be missed from exome-sequencing data.</p>
</sec>
</sec>
</abs>
</fm>
<bdy>
<sec><st><p>Background</p></st>
<p>Gastric cancer (GC) is the fourth most common cancer and the second leading cause of cancer death worldwide. Early stage GC is often asymptomatic or associated with non-specific symptoms, resulting in most patients presenting at advanced disease stages. Treatment options for late-stage GC patients are limited, with surgery and chemotherapy regimens offering modest survival benefits. Environmental risk factors for GC include a high salt diet, smoking, and infection by <it>Helicobacter pylori </it><abbrgrp><abbr bid="B1">1</abbr></abbrgrp>. Understanding the mutational impact of these environmental exposures on the genomes of gastric epithelial cells is essential to shed light on specific genes and pathways associated with gastric tumorigenesis.</p>
<p>Previous studies in lung cancer <abbrgrp><abbr bid="B2">2</abbr><abbr bid="B3">3</abbr></abbrgrp>, melanoma <abbrgrp><abbr bid="B4">4</abbr></abbrgrp>, and leukemia <abbrgrp><abbr bid="B5">5</abbr></abbrgrp> have shown that environmental carcinogens and drugs can elicit specific somatic mutational profiles in cancer genomes, referred to as 'mutational signatures'. While previous studies on GC have applied exome-sequencing approaches to identify frequently mutated genes <abbrgrp><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr></abbrgrp>, identifying mutational signatures is best done using whole-genome data, due to its completeness and ability to simultaneously uncover micro- and macro-scale somatic alterations. In this study, we sought to provide a more comprehensive understanding of mutational processes in GC by analyzing whole-genome sequences of two GCs and their matched-normal controls, using both short-read (SR) next-generation sequencing and a long insert (approximately 10 kbp) DNA paired-end tag (DNA-PET) protocol <abbrgrp><abbr bid="B8">8</abbr></abbrgrp>. We also sought to explore the combination of these datasets for <it>de novo </it>assembly of cancer and normal genomes and to comprehensively catalogue a range of (point mutations to megabase-sized) somatic alterations in the tumor. Finally, we used this catalogue to characterize the impact of mutational processes on genes and used a screening approach to validate recurrently mutated genes in subtypes of GC defined by specific mutational processes.</p>
</sec>
<sec><st><p>Results</p></st>
<sec><st><p>Integrative short read/DNA-PET analysis and <it>de novo </it>assembly</p></st>
<p>The matched tumor and normal samples analyzed were from two Singaporean patients. One GC exhibited evidence of microsatellite instability (MSI) and active <it>H. pylori </it>infection (see Table S1 in Additional file <supplr sid="S1">1</supplr> for other clinical characteristics). Each tumor and matched normal sample was sequenced to more than 30-fold average base pair coverage by Illumina SR sequencing (Materials and methods; Table S2 in Additional file <supplr sid="S1">1</supplr>), and to &gt; 130-fold physical coverage using large-insert (approximately 10 kbp) DNA-PET sequencing <abbrgrp><abbr bid="B9">9</abbr></abbrgrp> on the SOLiD platform (Materials and methods; Table S3 and Note 1 in Additional file <supplr sid="S1">1</supplr>). Single nucleotide variants (SNVs) and short insertions and deletions (indels) from tumor and normal genomes were combined to identify somatic variants (Table <tblr tid="T1">1</tblr> and Materials and methods) and reliability of somatic calls was confirmed using targeted sequencing (validation rate of 90% for SNVs and 96% for indels; Materials and methods). SR and DNA-PET data were also used to identify somatic copy-number variations (CNVs) and structural variations (SVs) (validation rate = 81%; Materials and methods; Note 1 in Additional file <supplr sid="S1">1</supplr>).</p>
<suppl id="S1">
<title><p>Additional file 1</p></title>
<text><p><b>Supplementary Methods, Tables and Figures</b>.</p></text>
<file name="gb-2012-13-12-r115-S1.DOCX">
   <p>Click here for file</p>
</file>
</suppl>
<tbl id="T1"><title><p>Table 1</p></title><caption><p>Somatic variations in two GC tumors identified by whole genome sequencing approaches</p></caption><tblbdy cols="3">
      <r>
         <c ca="left">
            <p>
               <b>Patient ID</b>
            </p>
         </c>
         <c ca="center">
            <p>
               <b>NGCII082</b>
            </p>
         </c>
         <c ca="center">
            <p>
               <b>NGCII092</b>
            </p>
         </c>
      </r>
      <r>
         <c cspan="3">
            <hr/>
         </c>
      </r>
      <r>
         <c ca="left">
            <p>SNVs, all somatic</p>
         </c>
         <c ca="center">
            <p>14,856</p>
         </c>
         <c ca="center">
            <p>17,473</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Coding regions</p>
         </c>
         <c ca="center">
            <p>119</p>
         </c>
         <c ca="center">
            <p>116</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Non-synonymous</p>
         </c>
         <c ca="center">
            <p>86</p>
         </c>
         <c ca="center">
            <p>73</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Promoter regions</p>
         </c>
         <c ca="center">
            <p>101</p>
         </c>
         <c ca="center">
            <p>161</p>
         </c>
      </r>
      <r>
         <c ca="left">
            <p>Indels, all somatic</p>
         </c>
         <c ca="center">
            <p>11,738</p>
         </c>
         <c ca="center">
            <p>2,486</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Coding regions</p>
         </c>
         <c ca="center">
            <p>12</p>
         </c>
         <c ca="center">
            <p>2</p>
         </c>
      </r>
      <r>
         <c ca="left">
            <p>CNVs, all somatic</p>
         </c>
         <c ca="center">
            <p>836</p>
         </c>
         <c ca="center">
            <p>21,776</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Affecting genes</p>
         </c>
         <c ca="center">
            <p>3</p>
         </c>
         <c ca="center">
            <p>265</p>
         </c>
      </r>
      <r>
         <c ca="left">
            <p>SVs, all somatic</p>
         </c>
         <c ca="center">
            <p>12</p>
         </c>
         <c ca="center">
            <p>146</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Affecting genes</p>
         </c>
         <c ca="center">
            <p>11</p>
         </c>
         <c ca="center">
            <p>96</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Deletions</p>
         </c>
         <c ca="center">
            <p>6</p>
         </c>
         <c ca="center">
            <p>56</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Tandem duplications</p>
         </c>
         <c ca="center">
            <p>2</p>
         </c>
         <c ca="center">
            <p>8</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Unpaired inversions</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
         <c ca="center">
            <p>26</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Inversions</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
         <c ca="center">
            <p>2</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Insertions (intra-chromosomal)</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Insertions (inter-chromosomal)</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Isolated translocations</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
         <c ca="center">
            <p>3</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Balanced translocations</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Complex events (intra-chromosomal)</p>
         </c>
         <c ca="center">
            <p>4</p>
         </c>
         <c ca="center">
            <p>49</p>
         </c>
      </r>
      <r>
         <c ca="left" indent="1">
            <p>Complex events (inter-chromosomal)</p>
         </c>
         <c ca="center">
            <p>0</p>
         </c>
         <c ca="center">
            <p>2</p>
         </c>
      </r>
   </tblbdy></tbl>
<p>We integrated the SR and DNA-PET sequence information to perform <it>de novo </it>assembly of the tumor and normal genomes. While complete <it>de novo </it>assembly of a tumor genome still poses significant technical challenges and has not been attempted before, we were able to use the SR/DNA-PET data to construct highly contiguous draft assemblies of median scaffold lengths (N50) in the range of 41 to 148 kb, with DNA-PET data assisting in tripling sequence contiguity of the assemblies (Materials and methods; Note 2 and Table S5 in Additional file <supplr sid="S1">1</supplr>). Importantly, performing <it>de novo </it>SR/DNA-PET assembly revealed several findings not observed using conventional analyses of the SR data. First, the <it>de novo </it>approach allowed for characterization of large-scale somatic structural variations at single base-pair resolution (SR libraries were unable to identify nearly half of the validated SVs and fusion genes; Note 1 in Additional file <supplr sid="S1">1</supplr>). For example, NGCII092 exhibited a focal genomic amplification on chromosome 12p11-12 in a region containing the wild-type <it>KRAS </it>gene, a genomic event frequently observed in GC <abbrgrp><abbr bid="B10">10</abbr></abbrgrp>. The combined SR/DNA-PET data (Materials and methods) enabled a detailed putative reconstruction of the evolutionary lineage of the amplified <it>KRAS </it>locus with concomitant deletion of a proposed tumor suppressor gene <it>RASSF8 </it>(as well as another focal amplicon at chromosome 6p) as described in the supplementary text (Figure <figr fid="F1">1</figr>; Figures S1 and S2 and Note 3 in Additional file <supplr sid="S1">1</supplr>). 
Reconstruction of the tumor genomes also allowed the prediction of fusion genes and complex rearrangements that resemble patterns created by replication coupled mechanisms <abbrgrp><abbr bid="B11">11</abbr></abbrgrp> and are further described in the supplementary text (Note 4 and Figures S3 and S4 in Additional file <supplr sid="S1">1</supplr> and Table S6 in Additional file <supplr sid="S2">2</supplr>).</p>
<fig id="F1"><title><p>Figure 1</p></title><caption><p>Copy number of two gastric cancer genomes, mechanism of 12p amplification and creation of a fusion gene</p></caption><text>
   <p><b>Copy number of two gastric cancer genomes, mechanism of 12p amplification and creation of a fusion gene</b>. <b>(a) </b>Somatic CNVs in the two gastric tumors (chromosomes are arranged on the x-axis, copy number is shown on the y-axis). <b>(b) </b>Copy number of chromosome 12 (top) and the amplicon on 12p (middle) are shown in orange (y-axis). Rearrangements identified by DNA-PET clusters with a size &#8805; 45 are represented by arrows and connecting lines (bottom). Dark red and pink arrows represent 5' and 3' cluster regions, respectively, with the connection between the tip of the dark red and the blunt end of the pink arrows. Numbers represent cluster sizes. <b>(c) </b>Fusion between <it>SOX5 </it>and <it>OVCH1 </it>predicted by a rearrangement point with cluster size of 129 in (b).</p>
</text><graphic file="gb-2012-13-12-r115-1"/></fig>
<suppl id="S2">
<title><p>Additional file 2</p></title>
<text><p><b>Table S6. Details of somatic SVs identified by DNA-PET in gastric tumors NGCII082 and NGCII092</b>.</p></text>
<file name="gb-2012-13-12-r115-S2.XLS">
   <p>Click here for file</p>
</file>
</suppl>
<p>Second, a combined SR/DNA-PET analysis allowed us to assemble sequences present in the tumor genome but not in the reference human genome. For example, in patient NGCII082 exhibiting active <it>H. pylori </it>infection, we detected approximately 2,000 short-sequence reads and &gt; 600 DNA-PET tags corresponding to the <it>H. pylori </it>genome (the first such report for a bacterial pathogen from tumor sequencing), in addition to a tumor-associated microbiome (these were not seen in NGCII092; see Figure S5 and Note 5 in Additional file <supplr sid="S1">1</supplr> for details). Note that, despite being fewer in number, the DNA-PET tags contributed significantly to the physical coverage and analysis of the genomes (Figure S5 and Note 5 in Additional file <supplr sid="S1">1</supplr>).</p>
<p>Third, the <it>de novo </it>assembly enabled annotation of human genes and variants in sequences absent in the reference genome. In total, we identified more than 3 Mbp of novel sequence (longer than 500 bp), containing several genes (including an ortholog to a cytokine receptor-like factor - <it>CRLF2</it>), and more than 1,000 somatic and germline variants for each patient (Materials and methods; Note 2 and Table S5 in Additional file <supplr sid="S1">1</supplr>).</p>
</sec>
<sec><st><p>Mutational signatures of damage by reactive oxygen species, deamination and microsatellite instability</p></st>
<p>We characterized mutational signatures in the GC genomes based on 14,856 somatic SNVs (11,738 indels) in NGCII082 and 17,473 somatic SNVs (2,486 indels) in NGCII092 that were identified from the whole-genome data (Table <tblr tid="T1">1</tblr>). This accounts for an average mutation frequency of 5 per megabase and included &gt; 100 SNVs in protein coding regions for each tumor (Table <tblr tid="T1">1</tblr>; Note 6 in Additional file <supplr sid="S1">1</supplr>). Note that we identified more than five times the number of somatic variants uncovered in earlier sequencing studies <abbrgrp><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr></abbrgrp> that were restricted to exomes (5,588 SNVs and 2,347 indels identified from 37 exomes), highlighting the statistical advantage of whole-genome analysis for studying mutational signatures. Overall, NGCII082, an MSI-positive tumor, displayed an excess of SNVs in protein coding regions (<it>P</it>-value &lt; 0.02, &#967;<sup>2 </sup>test) and a striking seven-fold higher frequency of micro-indels (Figures <figr fid="F2">2</figr> and <figr fid="F3">3d</figr>) but a lack of large-scale SVs and amplifications or deletions (Figure <figr fid="F2">2</figr> and Table <tblr tid="T1">1</tblr>). In contrast, NGCII092 exhibited a complex copy number profile of extensive focal amplifications and deletions, and a mutated <it>TP53 </it>gene, consistent with the presence of chromosomal instability (CIN) in the tumor genome (Figure <figr fid="F2">2</figr>). These results agree with the mutual exclusivity seen in MSI and CIN pathways for inducing mutations in other cancers as well <abbrgrp><abbr bid="B12">12</abbr></abbrgrp>.</p>
<fig id="F2"><title><p>Figure 2</p></title><caption><p>Map of somatic alterations in two gastric cancer genomes</p></caption><text>
   <p><b>Map of somatic alterations in two gastric cancer genomes</b>. The Circos plots depict the following information in order from outer to inner rings: using WGS data (1) CNVs (gain in red capped at 10 copies and loss in gray), (2) indel density (indel frequency per 10 kbp in blue, capped at 5 indels/10 kbp), (3) SNV density (SNV frequency per 10 kbp in black, each ring is 5 SNVs/10 kbp, capped at 10), and using DNA-PET data, (4) deletions (in red), tandem duplications (green) and inversions (purple), (5) intra- and (6) inter-chromosomal insertions (orange) and unpaired SVs (gray).</p>
</text><graphic file="gb-2012-13-12-r115-2"/></fig>
<fig id="F3"><title><p>Figure 3</p></title><caption><p>Genome-wide and exome-wide mutational fingerprint</p></caption><text>
   <p><b>Genome-wide and exome-wide mutational fingerprint</b>. <b>(a) </b>Frequency of various classes of somatic SNVs genome-wide. <b>(b) </b>Frequency of somatic SNVs exome-wide. <b>(c) </b>Mutational bias as a function of infection status using data from 34 exomes (bias for SNV class <it>i </it>was computed as (s<sub>i </sub>- g<sub>i</sub>)/g<sub>i</sub>, where s<sub>i </sub>and g<sub>i </sub>are the somatic and germline SNV frequencies). Note that nearly identical results were obtained when MSI tumors were excluded from the analysis (*<it>P</it>-value &lt; 0.1; **<it>P</it>-value &lt; 0.01). <b>(d) </b>Size-distribution of germline and somatic indels genome-wide.</p>
</text><graphic file="gb-2012-13-12-r115-3"/></fig>
<p>The clear excess of micro-indels in the MSI-positive GC (Figure <figr fid="F3">3d</figr>; Figure S10 in Additional file <supplr sid="S1">1</supplr>) was characterized by a pattern of single base-pair thymine deletions in mononucleotide repeats (79%). In contrast, there were a comparable number of insertions in both the MSI-positive and CIN-positive GC, and a similar deletion-specific pattern has also been noted before <abbrgrp><abbr bid="B13">13</abbr></abbrgrp>. Also, non-thymine and non-mononucleotide repeat deletions were not found to be in excess. The correlation between MSI phenotype and the specific deletion signature identified here was further confirmed from previous exome-sequencing data <abbrgrp><abbr bid="B7">7</abbr></abbrgrp> (four MSI-positive exomes), though this aspect was not noted in the previous work. In terms of genomic location, the deletions were randomly scattered throughout the genome and occurred in proportion to the regional presence of thymine mononucleotide repeats (that is, 85% of homopolymers &gt; 5 bp). Thus, despite the bias towards thymine deletions, there seems to be an absence of a targeting mechanism on the genome for the MSI-associated signature.</p>
<p>Despite exhibiting very different somatic alteration patterns (MSI or CIN), the mutational frequencies of both GCs at the single nucleotide level were highly similar, being significantly biased towards C &gt; A and T &gt; A alterations compared to normal genomes (<it>P</it>-value &lt; 10<sup>-16</sup>, &#967;<sup>2 </sup>test; Figure <figr fid="F3">3a</figr>). These alterations likely represent mutations caused by reactive oxygen and nitrogen species (ROS and RNS), which are known to produce C &gt; A and T &gt; A mutations <abbrgrp><abbr bid="B14">14</abbr></abbrgrp>. Also, a likely trigger is <it>H. pylori </it>infection, which has been shown to cause chronic inflammation and ROS/RNS production in gastric epithelial cells <abbrgrp><abbr bid="B14">14</abbr></abbrgrp>. The C &gt; A mutations observed were associated with highly significant sequence-selectivity, being marked by an excess at CpCpT (NGCII082, odds ratio (OR) = 3.2, <it>P</it>-value &lt; 10<sup>-16</sup>, &#967;<sup>2 </sup>test) or TpCpA sites (NGCII092, OR = 1.7, <it>P</it>-value &lt; 10<sup>-16</sup>, &#967;<sup>2 </sup>test) and extensions of these motifs (Materials and methods; Note 6 and Figure S6 in Additional file <supplr sid="S1">1</supplr> and Table S14 in Additional file <supplr sid="S6">6</supplr>). This pattern is distinct from the C &gt; A signature seen in smoking-associated small-cell lung cancer where an excess was seen in CpG dinucleotides outside CpG islands, suggesting a link with methylation status <abbrgrp><abbr bid="B2">2</abbr><abbr bid="B3">3</abbr></abbrgrp>. Further work is required to identify the mechanistic basis of sequence selectivity in this genome-wide GC-specific signature.</p>
<suppl id="S6">
<title><p>Additional file 6</p></title>
<text><p><b>Table S14. Enriched bases and motifs in the neighbourhood of C &gt; A mutations</b>.</p></text>
<file name="gb-2012-13-12-r115-S6.XLS">
   <p>Click here for file</p>
</file>
</suppl>
<sec><st><p>Exome-biased mutational signature in GC</p></st>
<p>Unlike the MSI and ROS/RNS signatures that were present in coding and non-coding regions of the genome, we also detected a third GC mutational signature only evident in coding regions (Figure <figr fid="F3">3b</figr>), characterized by an excess of C &gt; T mutations. These mutations were in excess at CpG (NGCII082, OR = 1.2, <it>P</it>-value &lt; 10<sup>-16</sup>, &#967;<sup>2 </sup>test) and GpC site (NGCII092, OR = 1.4, <it>P</it>-value &lt; 10<sup>-16</sup>, &#967;<sup>2 </sup>test) dinucleotides. The CpG alterations likely represent deamination of methylated cytosines followed by errors associated with transcription-coupled repair, which has also been observed in other cancers <abbrgrp><abbr bid="B2">2</abbr><abbr bid="B4">4</abbr></abbrgrp>. However, the latter bias towards C &gt; T alterations occurring at GpC motifs appears to be a unique feature not previously reported in other cancers <abbrgrp><abbr bid="B2">2</abbr><abbr bid="B4">4</abbr></abbrgrp> and could represent deamination due to enzymes such as AID (activation-induced cytidine deaminase) <abbrgrp><abbr bid="B15">15</abbr></abbrgrp>. AID is known to preferentially target transcribed regions <abbrgrp><abbr bid="B16">16</abbr></abbrgrp> and is aberrantly activated due to <it>H. pylori </it>infection in the gastric epithelium <abbrgrp><abbr bid="B17">17</abbr></abbrgrp>. Taken collectively, our whole-genome sequencing data implicates a minimum of three mutational signatures present in GC genomes, related to the presence of MSI, ROS/RNS, and deamination processes.</p>
<p>To further characterize the mutational signatures, we re-analyzed a total of 40 GC exomes, combining data from earlier studies <abbrgrp><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr></abbrgrp> with two new exomes in this study (Materials and methods; Table S8 and Figure S7 in Additional file <supplr sid="S1">1</supplr>). Specifically, a comparison of somatic and germline frequencies for the exomes showed that all but one patient had a significant excess of C &gt; A (ROS/RNS-related) or C &gt; T (deamination-related) alterations and 23 GCs (&gt; 50%) had an excess of both mutations (Fisher's exact test <it>P</it>-value &lt; 0.01), establishing these two mutational classes as the most significant single-nucleotide alterations in GC. These patterns were independent of histological subtype (intestinal, diffuse and mixed-type) and MSI status (the excess is also seen in all but one non-MSI tumor). Moreover, the frequencies of C &gt; T and C &gt; A mutations were significantly different in GCs with active <it>H. pylori </it>infection compared to those lacking active infection (Wilcoxon rank sum test <it>P</it>-value &lt; 0.006 and 0.06, respectively; Figure <figr fid="F3">3c</figr>). Overall, these results support the widespread role of ROS/RNS-associated C &gt; A and deamination-associated C &gt; T mutations in gastric cancer and are suggestive of their link to <it>H. pylori </it>infection.</p>
<p>A strong signature for transcription-coupled repair has been described before in other cancers <abbrgrp><abbr bid="B2">2</abbr><abbr bid="B4">4</abbr></abbrgrp> and our analysis also confirmed this in GC, in that poorly transcribed regions of the genome were associated with significantly more mutations (Figure S8 and Note 8 in Additional file <supplr sid="S1">1</supplr>). However, in contrast with earlier reports, we did not see a significant bias for mutations in the transcribed versus non-transcribed strand in most mutational classes (except for T &gt; G, <it>P</it>-value &lt; 0.05, &#967;<sup>2 </sup>test; Figure S8 in Additional file <supplr sid="S1">1</supplr>). The absence of this latter pattern may be a consequence of the higher mutational burden from mutagens that also act in a transcription-coupled fashion (for example, AID <abbrgrp><abbr bid="B16">16</abbr></abbrgrp>).</p>
</sec>
</sec>
<sec><st><p>Impact of mutational signatures on genes in GC</p></st>
<p>The overall impact of the mutational signatures identified here on gastric tumorigenesis is a complex question influenced by several factors, including the nature of mutations, the function of genes that are frequently impacted as well as genetic background and selection processes. We aimed to provide an initial assessment using two approaches: (i) by characterizing the proportion of genes affected by various mutational classes; and (ii) by identifying recurrently mutated genes in subtypes of GC defined by mutational processes.</p>
<p>Overall, a majority of mutated genes in NGCII082 were due to SNVs (77%) while CNVs and SVs played a dominant role in NGCII092 (82%) (Table <tblr tid="T1">1</tblr>). In total, we identified 107 SVs that affected genes by truncation, fusion, deletion, tandem duplication or rearrangements within the gene body. Ninety-six (90%) of these were identified in the CIN phenotype exhibiting tumor NGCII092, illustrating the genic burden from this mutational process. In contrast, small insertions and deletions (indels) were seen in few genes, even in the tumor with MSI phenotype (despite indels being roughly as common as SNVs genome-wide; Table <tblr tid="T1">1</tblr>), though their ability to cause frameshifts is likely to impact gene function more often than SNVs. Among SNVs, even though the deamination-related C &gt; T signature is only seen in a small fraction of the genome, it plays a larger role in GC due to its targeted impact on genes. More than 48% of the non-synonymous mutations seen (48% in NGCII092 and 59% in NGCII082) in the two tumors were due to C &gt; T mutations, compared to less than 19% for C &gt; A mutations (Table <tblr tid="T1">1</tblr>). Among recurrently mutated genes in GC (Table S7 in Additional file <supplr sid="S1">1</supplr> and Table S9 in Additional file <supplr sid="S3">3</supplr>), non-synonymous mutations in the tumor suppressor genes <it>TP53 </it>(mutated in 50% of samples) and <it>PTEN </it>(18% of samples), and oncogenes <it>PIK3CA </it>(13%; 8% have <it>PTEN </it>and <it>PIK3CA </it>mutations) and <it>CTNNB1 </it>(10%) were often in the form of C &gt; T mutations (29%). This was also seen in several novel recurrently mutated genes such as <it>AQP7</it>, <it>SPTA1 </it>and <it>RP1L1 </it>(mutated in &gt; 10% of tumors; Table S7 in Additional file <supplr sid="S1">1</supplr>).</p>
<suppl id="S3">
<title><p>Additional file 3</p></title>
<text><p><b>Table S9. Genes recurrently mutated by non-synonymous SNVs or indels in four or more patients out of 40 GC exomes</b>.</p></text>
<file name="gb-2012-13-12-r115-S3.XLS">
   <p>Click here for file</p>
</file>
</suppl>
<p>Pathway analysis of mutated genes revealed that the two most enriched sets were &#946;1-integrin mediated cell-surface interactions and signaling events mediated by class III histone deacetylases, a refinement of previous analysis <abbrgrp><abbr bid="B7">7</abbr></abbrgrp> (Table S10 in Additional file <supplr sid="S4">4</supplr>). Furthermore, we identified genes implicated in <it>RAC1 </it>regulation to be mutated in 83% of <it>H. pylori </it>positive samples (<it>P</it>-value &lt; 0.05 Fisher's exact test). <it>RAC1 </it>is a member of the Rho GTPase family known to play diverse oncogenic roles <abbrgrp><abbr bid="B18">18</abbr></abbrgrp>, shown to regulate the <it>H. pylori </it>virulence factor <it>VacA</it>, and known to promote vacuole formation in epithelial cells <abbrgrp><abbr bid="B19">19</abbr></abbrgrp>. Mutations in the <it>RAC1 </it>pathway could thus simultaneously promote <it>H. pylori </it>infection as well as gastric tumorigenesis.</p>
<suppl id="S4">
<title><p>Additional file 4</p></title>
<text><p><b>Table S10. Enriched functions and pathways in Gastric Cancer</b>.</p></text>
<file name="gb-2012-13-12-r115-S4.XLS">
   <p>Click here for file</p>
</file>
</suppl>
<p>Finally, to further characterize the impact of mutational processes on genes in GC, we considered two specific subtypes for identifying recurrently mutated genes, MSI-positive GC and <it>TP53</it>-wild-type GC (Tables S11 and S13 in Additional file <supplr sid="S1">1</supplr> and Table S12 in Additional file <supplr sid="S5">5</supplr>). We used <it>TP53</it>-wild-type status as a surrogate marker for tumors without the CIN phenotype as <it>TP53 </it>is known to suppress chromosomal instability <abbrgrp><abbr bid="B20">20</abbr></abbrgrp>. In this class of GCs, in addition to the tumor suppressor gene <it>PTEN </it>and <it>TTK </it>that interact with <it>TP53</it>, we identified <it>PAPPA</it>, a marker for pregnancies with aneuploid fetuses <abbrgrp><abbr bid="B21">21</abbr></abbrgrp>, as being recurrently mutated (Table S13 in Additional file <supplr sid="S1">1</supplr>; note that the average mutation rate for the whole-genome sequencing (WGS) samples in an approximately 2 Mbp window surrounding <it>PAPPA </it>is similar to the genome-wide rate, that is 5.3 versus 5.2 mutations/Mbp). A screen of an additional 94 gastric cancer/normal pairs confirmed the frequency of <it>PAPPA </it>mutations as being 6% among all GC samples (Table S12 in Additional file <supplr sid="S5">5</supplr>) and 20% among <it>TP53 </it>wild-type GCs (with mutations in key functional domains; Figures S13 and S14 in Additional file <supplr sid="S1">1</supplr>), highlighting it as a potential driver gene in this subtype.</p>
<suppl id="S5">
<title><p>Additional file 5</p></title>
<text><p><b>Table S12. Screen for recurrent mutations in 94 GC tumor/normal pairs by Sanger sequencing</b>.</p></text>
<file name="gb-2012-13-12-r115-S5.XLS">
   <p>Click here for file</p>
</file>
</suppl>
<p>In MSI-positive GCs, <it>ACVR2A</it>, <it>RPL22</it>, <it>LMAN1</it>, and <it>STAU2 </it>were observed to have recurrent single base thymine deletions in poly(T) regions (Table S11 in Additional file <supplr sid="S1">1</supplr>) and this was confirmed in a screen of an additional 94 gastric cancer/normal paired samples (9 MSI-positive; Table S12 in Additional file <supplr sid="S5">5</supplr> and Figure S9 and Note 9 in Additional file <supplr sid="S1">1</supplr>). In total, <it>ACVR2A </it>was mutated in a region of 8 thymines in 86% of MSI-positive GC tumors, <it>RPL22 </it>in a region of 8 thymines in 64%, <it>LMAN1 </it>in a region of 9 thymines in 50% and <it>STAU2 </it>in a region of 8 thymines in 29%. Based on the average frequency of mutations in homopolymer regions in the MSI-positive tumors (4.5% of 8 thymine stretches (<it>n </it>= 778) and 4.8% of 9 thymine stretches (<it>n </it>= 183), respectively, in exomic regions), mutations in <it>ACVR2A</it>, <it>RPL22 </it>and <it>LMAN1 </it>were in significant excess (Bonferroni-corrected <it>P</it>-value &#8804; 0.0003, exact binomial test). In each gene, all the deletions occurred in the same homopolymer tract containing thymines, a pattern linked to the MSI phenotype, and none of the MSI-negative GC tumors carried these mutations. In contrast, mutations in the recently reported MSI-associated putative driver gene <it>ARID1A </it>were not restricted to deletions or MSI-positive tumors <abbrgrp><abbr bid="B7">7</abbr></abbrgrp>. Interestingly, <it>ACVR2A </it>(encoding a TGF-&#946; super-family differentiation factor) has been described to be recurrently mutated in MSI-positive colorectal cancer <abbrgrp><abbr bid="B22">22</abbr></abbrgrp>. 
Also, the frequency of mutations seen here is comparable to the previously reported frequency in MSI-positive colorectal cancer <abbrgrp><abbr bid="B23">23</abbr><abbr bid="B24">24</abbr></abbrgrp> and emphasizes the importance of <it>ACVR2A </it>and TGF-&#946; signaling in MSI-positive GC, while unraveling the oncogenic roles of <it>RPL22 </it>and <it>LMAN1 </it>requires further investigation.</p>
</sec>
</sec>
<sec><st><p>Discussion</p></st>
<p>Until long read sequencing of several kilo-base pairs is routine, the combination of SR and long fragment mate-pair sequencing remains the most powerful approach to comprehensively capture micro- and macro-scale alterations in the cancer genome. The combination of SR and DNA-PET sequencing in this study thus provides the first comprehensive assessment of somatic alterations in GC. In particular, our results highlight the importance of whole-genome analysis for reconstructing the lineage of complex somatic structural variants and characterizing mutational processes and their genomic impact in cancer. For example, while point mutations in the <it>KRAS </it>gene have been well characterized, our whole-genome analysis enabled the first detailed reconstruction of amplification in the <it>KRAS </it>locus (a common event in GC) and a concomitant deletion of a proposed tumor suppressor gene <it>RASSF8</it>.</p>
<p>The analysis of several exome-sequencing datasets in earlier studies <abbrgrp><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr></abbrgrp> was able to provide only a limited view of mutational processes in GC. Whole-genome analysis was essential for providing sufficient detail and statistics to identify the features and relative impact of the various mutational processes (for example, MSI, ROS/RNS and CIN). This is best exemplified by the identification of a uniquely localized, deamination-linked mutational fingerprint whose significance would have been missed in an exome-based study. We further characterized the impact of this mutational process and identified the recurrently mutated genes <it>PAPPA</it>, <it>ACVR2A</it>, <it>RPL22</it>, <it>LMAN1</it>, and <it>STAU2 </it>in subtypes of GC defined by mutational processes.</p>
</sec>
<sec><st><p>Conclusions</p></st>
<p>While computational tools for <it>de novo </it>cancer genome assembly are limited, its utility is demonstrated by our reconstruction of the <it>H. pylori </it>strain genome and assembly-based characterization of SVs and fusion genes at the base pair level. As sequencing costs continue to drop, whole-genome sequencing and assembly of affected tissues can serve as a tool for biomarker and pathogen discovery in cancer and other diseases. Assembly tools need to be refined to address the twin challenges of genomic amplifications and mixed cell populations and the availability of whole-genome SR and DNA-PET data from the clinical samples in this study should serve as a useful resource in this effort.</p>
</sec>
<sec><st><p>Materials and methods</p></st>
<sec><st><p>Patient samples and clinical information</p></st>
<p>Patient samples and clinical information on tissue and blood samples were obtained from patients who had undergone surgery for gastric cancer at the National University Hospital, Singapore, and Tan Tock Seng Hospital, Singapore. Informed consent was obtained from all subjects and the study was approved by the Institutional Review Board of the National University of Singapore (reference code 05-145) as well as the National Healthcare Group Domain Specific Review Board (reference code 2005/00440). Clinical information for the two patients whose samples were analyzed by whole-genome sequencing is provided in Table S1 in Additional file <supplr sid="S1">1</supplr> and additional information for the 94 gastric tumors used for targeted screening is provided in Table S12 in Additional file <supplr sid="S5">5</supplr>.</p>
</sec>
<sec><st><p>Library preparation and sequencing</p></st>
<p>For WGS, genomic DNA isolated from tumor and blood samples was randomly fractionated using a Roche Nebulizer following the manufacturer's instructions (Madison, Wisconsin, USA). Fractionated DNA was then end-repaired, A-tailed at the 3' end, ligated with Illumina paired end adaptors, PCR amplified followed by gel-selection of a range of 400 to 600 bp fragments as templates and sequenced by Illumina GA from both ends to obtain 76 or 101 bp reads at each end (Table S2 in Additional file <supplr sid="S1">1</supplr>). DNA-PET libraries were constructed as described elsewhere <abbrgrp><abbr bid="B9">9</abbr></abbrgrp> and were sequenced by the Applied Biosystems SOLiD system (Carlsbad, California, USA, Table S3 in Additional file <supplr sid="S1">1</supplr>). Exome sequencing was performed as described earlier using SureSelect Human All Exon Kit v1 (Agilent Technologies, Santa Clara, California, USA) and sequencing on two lanes of Illumina GA-IIx sequencer using 76 bp paired-end reads <abbrgrp><abbr bid="B6">6</abbr></abbrgrp>.</p>
</sec>
<sec><st><p>Mapping and variant calling</p></st>
<p>Paired-end Illumina reads were mapped to the reference human genome (UCSC hg18) using ELAND (Illumina Inc.) and reads that failed pass-filter were removed from further analysis. SNVs and indels were called for each sample separately using SAMtools <abbrgrp><abbr bid="B25">25</abbr></abbrgrp> (v0.1.7-6, SNP-quality threshold = 20, consensus-quality threshold = 30) (Table S4 in Additional file <supplr sid="S1">1</supplr>). Identical variant calls in tumor and matched normal samples were used to identify germline variants. Variant calls unique to the tumor, where the normal genotype called by SAMtools was different and where less than two reads of the variant genotype were seen in the normal sample, provided the list of somatic variants. Illumina reads from exome sequencing were analyzed using this pipeline after BWA <abbrgrp><abbr bid="B26">26</abbr></abbrgrp> mapping (Table S8 in Additional file <supplr sid="S1">1</supplr>). As a control, we noted that germline SNV frequencies were nearly identical across all exomes from WGS and exome sequencing datasets (Figure S7 in Additional file <supplr sid="S1">1</supplr>). Somatic SNV frequencies and neighborhoods were compared to germline frequencies to assess enrichment. A neighborhood of up to 2 bp surrounding an SNV was used to identify enriched motifs. Somatic indel calls were required to be supported by at least 20% of the reads, by reads on both strands, with a minimum of 10 reads overlapping the position in the tumor and no indel calls in the normal sample. Somatic SNVs and indels in protein-coding regions and introns were confirmed by Sanger sequencing to have a high validation rate (83 SNVs, validation rate = 90%; 72 indels, validation rate = 96%). SNV neighborhood analysis was done by extracting 5 bp sequences upstream and downstream of mutations. Germline and somatic copy number variants were identified using the program RDXplorer <abbrgrp><abbr bid="B27">27</abbr></abbrgrp> with default parameters.</p>
<p>DNA-PET tags were mapped individually to the reference human genome (UCSC hg18) in color space allowing two color code mismatches per tag by the SOLiD System Analysis Pipeline Tool Corona Lite (Applied Biosystems Inc.). Contigs of the reference sequence with unresolved location (random_chr) and alternative MHC haplotypes were excluded from the reference for mapping. Individually mapped tags were paired by Corona Lite. In cases where one or both tags had multiple mapping locations, a process termed 'rescuing' favored the creation of concordant PETs (both tags are on the same chromosome, same strand, same orientation, correct 5' &#8594; 3' order and in the expected distance to each other).</p>
<p>SVs, based on clusters of non-concordant PETs, were called using the GIS DNA-PET pipeline <abbrgrp><abbr bid="B9">9</abbr></abbrgrp> with refined quality control criteria: (i) PET clusters of size &lt; 6 were excluded; (ii) the regions to which the 5' and 3' tags of a cluster mapped had to be at least 1 kbp in size each; (iii) PET clusters that had a supercluster (connected component of overlapping clusters <abbrgrp><abbr bid="B9">9</abbr></abbrgrp>) size &gt; 100 required a higher cluster size of 10; and (iv) PET clusters with high sequence similarity between the two fused regions (BLAST score &gt; 2,000 for 20 kbp windows around the predicted break points) were excluded. To distinguish between germline and somatic SVs, paired normal and tumor samples were compared as described previously <abbrgrp><abbr bid="B9">9</abbr></abbrgrp>. Further filtering of known germline SVs and PCR validation are described in Note 1 in Additional file <supplr sid="S1">1</supplr>.</p>
</sec>
<sec><st><p>Cancer genome assembly</p></st>
<p>Contig assembly, scaffolding and gap-filling of the Illumina sequencing data were done using the assembler SOAPdenovo <abbrgrp><abbr bid="B28">28</abbr></abbrgrp>. DNA-PET reads were mapped to the SOAPdenovo assembly with Bowtie <abbrgrp><abbr bid="B29">29</abbr></abbrgrp> and the resulting linking information was used to produce larger scaffolds based on the optimal scaffolder Opera <abbrgrp><abbr bid="B30">30</abbr></abbrgrp>. Scaffolds and contigs were refined further with the gap-filling module in SOAPdenovo, employed for bridging scaffold gaps, where feasible. Using the SR reads alone, we obtained 12 kb scaffold N50 for both tumors. The DNA-PET reads allowed for improvement of assembly connectivity to a N50 of 65 kb and 41 kb for NGCII082 and NGCII092, respectively. Assemblies were compared to the reference human genome (UCSC hg18) using the MUMmer package <abbrgrp><abbr bid="B31">31</abbr></abbrgrp> and alignments longer than 1 kbp were used to identify deletions and insertions larger than 20 bp. Overall, 12,861 deletions and 143 insertions were found in NGCII082 and 9,274 deletions and 108 insertions in NGCII092 of which 3 events &gt; 2 kbp missed by DNA-PET analysis were identified in each sample. Fusion genes were validated and breakpoints were confirmed by using the gap-filling module in SOAPdenovo to bridge scaffolds constructed around the breakpoint. Sequences missing in the reference human genome were identified based on the criteria that they should be &gt; 500 bp long and have no match to the reference genome with &gt; 90% identity. Reads were mapped to the novel sequences using Bowtie to identify regions with no read coverage in the middle of a scaffold that could indicate a potential mis-assembly.</p>
</sec>
<sec><st><p>Analysis of microbial sequences</p></st>
<p>Reads with a putative microbial or viral origin were identified by mapping reads with no mapping to the human genome, to a database of complete bacterial and viral genomes in NCBI (using Bowtie <abbrgrp><abbr bid="B29">29</abbr></abbrgrp>). Matches were filtered for low-complexity sequences (more than three matches of any 5-mer) and the remaining reads were used to estimate the abundance for each species (pooling reads mapped to different strains of a species). Each species was checked for multiple distinct read matches to its genome (&gt; 4 distinct regions, where the genome was segmented in 1 kbp windows) and the presence of unique read matches (using the unique option in Bowtie). The small fraction of reads of putative bacterial origin in the matched blood samples (possibly reagent contamination) were used as control and read matches to the corresponding species were excluded in determining the tumor associated microbiome. Concentration of <it>H. pylori </it>cells in relation to tumor cells was estimated based on the assumption of uniform coverage of both cell types, where coverage = k &#215; Number of cells &#215; Size of genome, for a constant k and the populations are assumed to be clonal.</p>
</sec>
<sec><st><p>Functional annotation of SNVs and indels</p></st>
<p>For all samples, SNV and indel calls were annotated using the SeattleSeq server <abbrgrp><abbr bid="B32">32</abbr></abbrgrp> and SIFT <abbrgrp><abbr bid="B33">33</abbr></abbrgrp>, respectively. Pathway analyses were performed based on non-synonymous SNVs and indels using the Pathway Interaction Database <abbrgrp><abbr bid="B34">34</abbr></abbrgrp> (sample pfg005T from Wang <it>et al. </it><abbrgrp><abbr bid="B7">7</abbr></abbrgrp> was excluded as it only had four somatic mutations).</p>
</sec>
<sec><st><p>Data access</p></st>
<p>Sequencing data for this publication have been deposited in NCBI's Gene Expression Omnibus <abbrgrp><abbr bid="B35">35</abbr></abbrgrp> and are accessible through GEO Series accession number GSE30833.</p>
</sec>
</sec>
<sec><st><p>Abbreviations</p></st>
<p>CIN: chromosomal instability; CNV: copy-number variation; DNA-PET: DNA paired-end tag; GC: gastric cancer; MSI: microsatellite instability; OR: odds ratio; RNS: reactive nitrogen species; ROS: reactive oxygen species; SNV: single nucleotide variation; SR: short read; SV: structural variation; WGS: whole-genome sequencing.</p>
</sec>
<sec><st><p>Authors' contributions</p></st>
<p>YR and KGY initiated the study. NN, DB, AMH, PBOT and YR designed the experiments. JR, MT, FZ, JBYS, RS and KGY obtained ethical approval, patient information and patient samples and commented on clinical relevance of genomic findings. ASMT, ZZ and AH constructed genome-wide sequencing libraries (SR and DNA-PET). NN, DB and AMH coordinated the data analysis. DB and NN did the mutation analysis with assistance from LV and AS. PEJ did the expression analysis. AMH, FY, WHL, PNA, XYW and CCK did the copy number and structural variation analysis with guidance from WKS, GB and MLH. FY, ASMT and YYS performed validation of structural variations and point mutations and screened for recurrent mutations and indels. YYS performed quantitative PCR. SG and DB did the assembly analysis with guidance from NN. DB analyzed the impact of mutations with guidance from AMH, NN, PBOT and KVD. MW, SYC, BP and RS performed microsatellite instability analysis for the cohort of patient samples that were screened for recurrent mutations. XR coordinated Illumina and SOLiD sequencing of the WGS samples. ZJZ, IC, CKO, ND, BTT, SR and PBOT coordinated and executed the exome sequencing and mapping analysis of the data. NN, DB, AMH and PBOT wrote the manuscript. All authors read and approved the final manuscript.</p>
</sec>
</bdy>
<bm>
<ack>
<sec><st><p>Acknowledgements</p></st>
<p>This work was supported by the Agency for Science Technology and Research (A*STAR), Singapore, the Translational Clinical Research (TCR) Flagship Programme - 'The Singapore Gastric Cancer Consortium- Improving Outcomes for Our Patients', the National Cancer Institute (USA) NCI: 5 R33 CA126996-02 (Pair-end-ditag technologies for the complete annotation of fusion genes) and funds from the Lee Foundation and the National Cancer Centre Research Foundation. Additional support was provided by the Genome Institute of Singapore internal research funds from the National Medical Research Council of Singapore and Biomedical Research Council (BMRC) of A*STAR. We thank Hwee Meng Low and Yeen Hui Choy for help on PCR validation, Herv&#233; Thoreau for managing of the sequencing platforms, See Ting Leong, Say Chuan Neo, and Poh Sum D Choi for SOLiD sequencing and Chin Thing Ong, Adeline Lai San Chew, Kian Chew Lim, Yen Ling Yee, Thompson Poh, Raquel Peh, and Crystal Toh for Illumina sequencing.</p>
</sec>
</ack>
<refgrp><bibl id="B1"><title><p>Risk factors in gastric cancer.</p></title><aug><au><snm>Compare</snm><fnm>D</fnm></au><au><snm>Rocco</snm><fnm>A</fnm></au><au><snm>Nardone</snm><fnm>G</fnm></au></aug><source>Eur Rev Med Pharmacol Sci</source><pubdate>2010</pubdate><volume>14</volume><fpage>302</fpage><lpage>308</lpage><xrefbib><pubid idtype="pmpid">20496539</pubid></xrefbib></bibl><bibl id="B2"><title><p>A small-cell lung cancer genome with complex signatures of tobacco exposure.</p></title><aug><au><snm>Pleasance</snm><fnm>ED</fnm></au><au><snm>Stephens</snm><fnm>PJ</fnm></au><au><snm>O'Meara</snm><fnm>S</fnm></au><au><snm>McBride</snm><fnm>DJ</fnm></au><au><snm>Meynert</snm><fnm>A</fnm></au><au><snm>Jones</snm><fnm>D</fnm></au><au><snm>Lin</snm><fnm>ML</fnm></au><au><snm>Beare</snm><fnm>D</fnm></au><au><snm>Lau</snm><fnm>KW</fnm></au><au><snm>Greenman</snm><fnm>C</fnm></au><au><snm>Varela</snm><fnm>I</fnm></au><au><snm>Nik-Zainal</snm><fnm>S</fnm></au><au><snm>Davies</snm><fnm>HR</fnm></au><au><snm>Ordonez</snm><fnm>GR</fnm></au><au><snm>Mudie</snm><fnm>LJ</fnm></au><au><snm>Latimer</snm><fnm>C</fnm></au><au><snm>Edkins</snm><fnm>S</fnm></au><au><snm>Stebbings</snm><fnm>L</fnm></au><au><snm>Chen</snm><fnm>L</fnm></au><au><snm>Jia</snm><fnm>M</fnm></au><au><snm>Leroy</snm><fnm>C</fnm></au><au><snm>Marshall</snm><fnm>J</fnm></au><au><snm>Menzies</snm><fnm>A</fnm></au><au><snm>Butler</snm><fnm>A</fnm></au><au><snm>Teague</snm><fnm>JW</fnm></au><au><snm>Mangion</snm><fnm>J</fnm></au><au><snm>Sun</snm><fnm>YA</fnm></au><au><snm>McLaughlin</snm><fnm>SF</fnm></au><au><snm>Peckham</snm><fnm>HE</fnm></au><au><snm>Tsung</snm><fnm>EF</fnm></au><etal/></aug><source>Nature</source><pubdate>2010</pubdate><volume>463</volume><fpage>184</fpage><lpage>190</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/nature08629</pubid><pubid idtype="pmcid">2880489</pubid><pubid idtype="pmpid" link="fulltext">20016488</pubid></pubidlist></xrefbib></bibl><bibl id="B3"><title><p>The 
mutation spectrum revealed by paired genome sequences from a lung cancer patient.</p></title><aug><au><snm>Lee</snm><fnm>W</fnm></au><au><snm>Jiang</snm><fnm>Z</fnm></au><au><snm>Liu</snm><fnm>J</fnm></au><au><snm>Haverty</snm><fnm>PM</fnm></au><au><snm>Guan</snm><fnm>Y</fnm></au><au><snm>Stinson</snm><fnm>J</fnm></au><au><snm>Yue</snm><fnm>P</fnm></au><au><snm>Zhang</snm><fnm>Y</fnm></au><au><snm>Pant</snm><fnm>KP</fnm></au><au><snm>Bhatt</snm><fnm>D</fnm></au><au><snm>Ha</snm><fnm>C</fnm></au><au><snm>Johnson</snm><fnm>S</fnm></au><au><snm>Kennemer</snm><fnm>MI</fnm></au><au><snm>Mohan</snm><fnm>S</fnm></au><au><snm>Nazarenko</snm><fnm>I</fnm></au><au><snm>Watanabe</snm><fnm>C</fnm></au><au><snm>Sparks</snm><fnm>AB</fnm></au><au><snm>Shames</snm><fnm>DS</fnm></au><au><snm>Gentleman</snm><fnm>R</fnm></au><au><snm>de Sauvage</snm><fnm>FJ</fnm></au><au><snm>Stern</snm><fnm>H</fnm></au><au><snm>Pandita</snm><fnm>A</fnm></au><au><snm>Ballinger</snm><fnm>DG</fnm></au><au><snm>Drmanac</snm><fnm>R</fnm></au><au><snm>Modrusan</snm><fnm>Z</fnm></au><au><snm>Seshagiri</snm><fnm>S</fnm></au><au><snm>Zhang</snm><fnm>Z</fnm></au></aug><source>Nature</source><pubdate>2010</pubdate><volume>465</volume><fpage>473</fpage><lpage>477</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/nature09004</pubid><pubid idtype="pmpid" link="fulltext">20505728</pubid></pubidlist></xrefbib></bibl><bibl id="B4"><title><p>A comprehensive catalogue of somatic mutations from a human cancer 
genome.</p></title><aug><au><snm>Pleasance</snm><fnm>ED</fnm></au><au><snm>Cheetham</snm><fnm>RK</fnm></au><au><snm>Stephens</snm><fnm>PJ</fnm></au><au><snm>McBride</snm><fnm>DJ</fnm></au><au><snm>Humphray</snm><fnm>SJ</fnm></au><au><snm>Greenman</snm><fnm>CD</fnm></au><au><snm>Varela</snm><fnm>I</fnm></au><au><snm>Lin</snm><fnm>ML</fnm></au><au><snm>Ordonez</snm><fnm>GR</fnm></au><au><snm>Bignell</snm><fnm>GR</fnm></au><au><snm>Ye</snm><fnm>K</fnm></au><au><snm>Alipaz</snm><fnm>J</fnm></au><au><snm>Bauer</snm><fnm>MJ</fnm></au><au><snm>Beare</snm><fnm>D</fnm></au><au><snm>Butler</snm><fnm>A</fnm></au><au><snm>Carter</snm><fnm>RJ</fnm></au><au><snm>Chen</snm><fnm>L</fnm></au><au><snm>Cox</snm><fnm>AJ</fnm></au><au><snm>Edkins</snm><fnm>S</fnm></au><au><snm>Kokko-Gonzales</snm><fnm>PI</fnm></au><au><snm>Gormley</snm><fnm>NA</fnm></au><au><snm>Grocock</snm><fnm>RJ</fnm></au><au><snm>Haudenschild</snm><fnm>CD</fnm></au><au><snm>Hims</snm><fnm>MM</fnm></au><au><snm>James</snm><fnm>T</fnm></au><au><snm>Jia</snm><fnm>M</fnm></au><au><snm>Kingsbury</snm><fnm>Z</fnm></au><au><snm>Leroy</snm><fnm>C</fnm></au><au><snm>Marshall</snm><fnm>J</fnm></au><au><snm>Menzies</snm><fnm>A</fnm></au><etal/></aug><source>Nature</source><pubdate>2010</pubdate><volume>463</volume><fpage>191</fpage><lpage>196</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/nature08658</pubid><pubid idtype="pmcid">3145108</pubid><pubid idtype="pmpid" link="fulltext">20016485</pubid></pubidlist></xrefbib></bibl><bibl id="B5"><title><p>Whole-genome sequencing identifies recurrent mutations in chronic lymphocytic 
leukaemia.</p></title><aug><au><snm>Puente</snm><fnm>XS</fnm></au><au><snm>Pinyol</snm><fnm>M</fnm></au><au><snm>Quesada</snm><fnm>V</fnm></au><au><snm>Conde</snm><fnm>L</fnm></au><au><snm>Ordonez</snm><fnm>GR</fnm></au><au><snm>Villamor</snm><fnm>N</fnm></au><au><snm>Escaramis</snm><fnm>G</fnm></au><au><snm>Jares</snm><fnm>P</fnm></au><au><snm>Bea</snm><fnm>S</fnm></au><au><snm>Gonzalez-Diaz</snm><fnm>M</fnm></au><au><snm>Bassaganyas</snm><fnm>L</fnm></au><au><snm>Baumann</snm><fnm>T</fnm></au><au><snm>Juan</snm><fnm>M</fnm></au><au><snm>Lopez-Guerra</snm><fnm>M</fnm></au><au><snm>Colomer</snm><fnm>D</fnm></au><au><snm>Tubio</snm><fnm>JM</fnm></au><au><snm>Lopez</snm><fnm>C</fnm></au><au><snm>Navarro</snm><fnm>A</fnm></au><au><snm>Tornador</snm><fnm>C</fnm></au><au><snm>Aymerich</snm><fnm>M</fnm></au><au><snm>Rozman</snm><fnm>M</fnm></au><au><snm>Hernandez</snm><fnm>JM</fnm></au><au><snm>Puente</snm><fnm>DA</fnm></au><au><snm>Freije</snm><fnm>JM</fnm></au><au><snm>Velasco</snm><fnm>G</fnm></au><au><snm>Gutierrez-Fernandez</snm><fnm>A</fnm></au><au><snm>Costa</snm><fnm>D</fnm></au><au><snm>Carrio</snm><fnm>A</fnm></au><au><snm>Guijarro</snm><fnm>S</fnm></au><au><snm>Enjuanes</snm><fnm>A</fnm></au><etal/></aug><source>Nature</source><pubdate>2011</pubdate><volume>475</volume><fpage>101</fpage><lpage>105</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/nature10113</pubid><pubid idtype="pmcid">3322590</pubid><pubid idtype="pmpid" link="fulltext">21642962</pubid></pubidlist></xrefbib></bibl><bibl id="B6"><title><p>Exome sequencing of gastric adenocarcinoma identifies recurrent somatic mutations in cell adhesion and chromatin remodeling 
genes.</p></title><aug><au><snm>Zang</snm><fnm>ZJ</fnm></au><au><snm>Cutcutache</snm><fnm>I</fnm></au><au><snm>Poon</snm><fnm>SL</fnm></au><au><snm>Zhang</snm><fnm>SL</fnm></au><au><snm>McPherson</snm><fnm>JR</fnm></au><au><snm>Tao</snm><fnm>J</fnm></au><au><snm>Rajasegaran</snm><fnm>V</fnm></au><au><snm>Heng</snm><fnm>HL</fnm></au><au><snm>Deng</snm><fnm>N</fnm></au><au><snm>Gan</snm><fnm>A</fnm></au><au><snm>Lim</snm><fnm>KH</fnm></au><au><snm>Ong</snm><fnm>CK</fnm></au><au><snm>Huang</snm><fnm>D</fnm></au><au><snm>Chin</snm><fnm>SY</fnm></au><au><snm>Tan</snm><fnm>IB</fnm></au><au><snm>Ng</snm><fnm>CC</fnm></au><au><snm>Yu</snm><fnm>W</fnm></au><au><snm>Wu</snm><fnm>Y</fnm></au><au><snm>Lee</snm><fnm>M</fnm></au><au><snm>Wu</snm><fnm>J</fnm></au><au><snm>Poh</snm><fnm>D</fnm></au><au><snm>Wan</snm><fnm>WK</fnm></au><au><snm>Rha</snm><fnm>SY</fnm></au><au><snm>So</snm><fnm>J</fnm></au><au><snm>Salto-Tellez</snm><fnm>M</fnm></au><au><snm>Yeoh</snm><fnm>KG</fnm></au><au><snm>Wong</snm><fnm>WK</fnm></au><au><snm>Zhu</snm><fnm>YJ</fnm></au><au><snm>Futreal</snm><fnm>PA</fnm></au><au><snm>Pang</snm><fnm>B</fnm></au><etal/></aug><source>Nat Genet</source><pubdate>2012</pubdate><volume>44</volume><fpage>570</fpage><lpage>574</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/ng.2246</pubid><pubid idtype="pmpid" link="fulltext">22484628</pubid></pubidlist></xrefbib></bibl><bibl id="B7"><title><p>Exome sequencing identifies frequent mutation of ARID1A in molecular subtypes of gastric 
cancer.</p></title><aug><au><snm>Wang</snm><fnm>K</fnm></au><au><snm>Kan</snm><fnm>J</fnm></au><au><snm>Yuen</snm><fnm>ST</fnm></au><au><snm>Shi</snm><fnm>ST</fnm></au><au><snm>Chu</snm><fnm>KM</fnm></au><au><snm>Law</snm><fnm>S</fnm></au><au><snm>Chan</snm><fnm>TL</fnm></au><au><snm>Kan</snm><fnm>Z</fnm></au><au><snm>Chan</snm><fnm>AS</fnm></au><au><snm>Tsui</snm><fnm>WY</fnm></au><au><snm>Lee</snm><fnm>SP</fnm></au><au><snm>Ho</snm><fnm>SL</fnm></au><au><snm>Chan</snm><fnm>AK</fnm></au><au><snm>Cheng</snm><fnm>GH</fnm></au><au><snm>Roberts</snm><fnm>PC</fnm></au><au><snm>Rejto</snm><fnm>PA</fnm></au><au><snm>Gibson</snm><fnm>NW</fnm></au><au><snm>Pocalyko</snm><fnm>DJ</fnm></au><au><snm>Mao</snm><fnm>M</fnm></au><au><snm>Xu</snm><fnm>J</fnm></au><au><snm>Leung</snm><fnm>SY</fnm></au></aug><source>Nat Genet</source><pubdate>2011</pubdate><volume>43</volume><fpage>1219</fpage><lpage>1223</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/ng.982</pubid><pubid idtype="pmpid" link="fulltext">22037554</pubid></pubidlist></xrefbib></bibl><bibl id="B8"><title><p>Next-generation DNA sequencing of paired-end tags (PET) for transcriptome and genome analyses.</p></title><aug><au><snm>Fullwood</snm><fnm>MJ</fnm></au><au><snm>Wei</snm><fnm>CL</fnm></au><au><snm>Liu</snm><fnm>ET</fnm></au><au><snm>Ruan</snm><fnm>Y</fnm></au></aug><source>Genome Res</source><pubdate>2009</pubdate><volume>19</volume><fpage>521</fpage><lpage>532</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1101/gr.074906.107</pubid><pubid idtype="pmpid" link="fulltext">19339662</pubid></pubidlist></xrefbib></bibl><bibl id="B9"><title><p>Comprehensive long-span paired-end-tag mapping reveals characteristic patterns of structural variations in epithelial cancer 
genomes.</p></title><aug><au><snm>Hillmer</snm><fnm>AM</fnm></au><au><snm>Yao</snm><fnm>F</fnm></au><au><snm>Inaki</snm><fnm>K</fnm></au><au><snm>Lee</snm><fnm>WH</fnm></au><au><snm>Ariyaratne</snm><fnm>PN</fnm></au><au><snm>Teo</snm><fnm>AS</fnm></au><au><snm>Woo</snm><fnm>XY</fnm></au><au><snm>Zhang</snm><fnm>Z</fnm></au><au><snm>Zhao</snm><fnm>H</fnm></au><au><snm>Ukil</snm><fnm>L</fnm></au><au><snm>Chen</snm><fnm>JP</fnm></au><au><snm>Zhu</snm><fnm>F</fnm></au><au><snm>So</snm><fnm>JB</fnm></au><au><snm>Salto-Tellez</snm><fnm>M</fnm></au><au><snm>Poh</snm><fnm>WT</fnm></au><au><snm>Zawack</snm><fnm>KF</fnm></au><au><snm>Nagarajan</snm><fnm>N</fnm></au><au><snm>Gao</snm><fnm>S</fnm></au><au><snm>Li</snm><fnm>G</fnm></au><au><snm>Kumar</snm><fnm>V</fnm></au><au><snm>Lim</snm><fnm>HP</fnm></au><au><snm>Sia</snm><fnm>YY</fnm></au><au><snm>Chan</snm><fnm>CS</fnm></au><au><snm>Leong</snm><fnm>ST</fnm></au><au><snm>Neo</snm><fnm>SC</fnm></au><au><snm>Choi</snm><fnm>PS</fnm></au><au><snm>Thoreau</snm><fnm>H</fnm></au><au><snm>Tan</snm><fnm>PB</fnm></au><au><snm>Shahab</snm><fnm>A</fnm></au><au><snm>Ruan</snm><fnm>X</fnm></au><etal/></aug><source>Genome Res</source><pubdate>2011</pubdate><volume>21</volume><fpage>665</fpage><lpage>675</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1101/gr.113555.110</pubid><pubid idtype="pmcid">3083083</pubid><pubid idtype="pmpid" link="fulltext">21467267</pubid></pubidlist></xrefbib></bibl><bibl id="B10"><title><p>A comprehensive survey of genomic alterations in gastric cancer reveals systematic patterns of molecular exclusivity and co-occurrence among distinct therapeutic 
targets.</p></title><aug><au><snm>Deng</snm><fnm>N</fnm></au><au><snm>Goh</snm><fnm>LK</fnm></au><au><snm>Wang</snm><fnm>H</fnm></au><au><snm>Das</snm><fnm>K</fnm></au><au><snm>Tao</snm><fnm>J</fnm></au><au><snm>Tan</snm><fnm>IB</fnm></au><au><snm>Zhang</snm><fnm>S</fnm></au><au><snm>Lee</snm><fnm>M</fnm></au><au><snm>Wu</snm><fnm>J</fnm></au><au><snm>Lim</snm><fnm>KH</fnm></au><au><snm>Lei</snm><fnm>Z</fnm></au><au><snm>Goh</snm><fnm>G</fnm></au><au><snm>Lim</snm><fnm>QY</fnm></au><au><snm>Lay-Keng Tan</snm><fnm>A</fnm></au><au><snm>Sin Poh</snm><fnm>DY</fnm></au><au><snm>Riahi</snm><fnm>S</fnm></au><au><snm>Bell</snm><fnm>S</fnm></au><au><snm>Shi</snm><fnm>MM</fnm></au><au><snm>Linnartz</snm><fnm>R</fnm></au><au><snm>Zhu</snm><fnm>F</fnm></au><au><snm>Yeoh</snm><fnm>KG</fnm></au><au><snm>Toh</snm><fnm>HC</fnm></au><au><snm>Yong</snm><fnm>WP</fnm></au><au><snm>Cheong</snm><fnm>HC</fnm></au><au><snm>Rha</snm><fnm>SY</fnm></au><au><snm>Boussioutas</snm><fnm>A</fnm></au><au><snm>Grabsch</snm><fnm>H</fnm></au><au><snm>Rozen</snm><fnm>S</fnm></au><au><snm>Tan</snm><fnm>P</fnm></au></aug><source>Gut</source><pubdate>2012</pubdate><volume>61</volume><fpage>673</fpage><lpage>684</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1136/gutjnl-2011-301839</pubid><pubid idtype="pmcid">3322587</pubid><pubid idtype="pmpid" link="fulltext">22315472</pubid></pubidlist></xrefbib></bibl><bibl id="B11"><title><p>Mechanisms for human genomic rearrangements.</p></title><aug><au><snm>Gu</snm><fnm>W</fnm></au><au><snm>Zhang</snm><fnm>F</fnm></au><au><snm>Lupski</snm><fnm>JR</fnm></au></aug><source>Pathogenetics</source><pubdate>2008</pubdate><volume>1</volume><fpage>4</fpage><xrefbib><pubidlist><pubid idtype="doi">10.1186/1755-8417-1-4</pubid><pubid idtype="pmcid">2583991</pubid><pubid idtype="pmpid" link="fulltext">19014668</pubid></pubidlist></xrefbib></bibl><bibl id="B12"><title><p>Genetic instabilities in human 
cancers.</p></title><aug><au><snm>Lengauer</snm><fnm>C</fnm></au><au><snm>Kinzler</snm><fnm>KW</fnm></au><au><snm>Vogelstein</snm><fnm>B</fnm></au></aug><source>Nature</source><pubdate>1998</pubdate><volume>396</volume><fpage>643</fpage><lpage>649</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/25292</pubid><pubid idtype="pmpid" link="fulltext">9872311</pubid></pubidlist></xrefbib></bibl><bibl id="B13"><title><p>Do microsatellite instability profiles really differ between colorectal and endometrial tumors?</p></title><aug><au><snm>Ferreira</snm><fnm>AM</fnm></au><au><snm>Westers</snm><fnm>H</fnm></au><au><snm>Wu</snm><fnm>Y</fnm></au><au><snm>Niessen</snm><fnm>RC</fnm></au><au><snm>Olderode-Berends</snm><fnm>M</fnm></au><au><snm>van der Sluis</snm><fnm>T</fnm></au><au><snm>van der Zee</snm><fnm>AG</fnm></au><au><snm>Hollema</snm><fnm>H</fnm></au><au><snm>Kleibeuker</snm><fnm>JH</fnm></au><au><snm>Sijmons</snm><fnm>RH</fnm></au><au><snm>Hofstra</snm><fnm>RM</fnm></au></aug><source>Genes Chromosomes Cancer</source><pubdate>2009</pubdate><volume>48</volume><fpage>552</fpage><lpage>557</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1002/gcc.20664</pubid><pubid idtype="pmpid" link="fulltext">19373783</pubid></pubidlist></xrefbib></bibl><bibl id="B14"><title><p>8-nitroguanine, a product of nitrative DNA damage caused by reactive nitrogen species: formation, occurrence, and implications in inflammation and carcinogenesis.</p></title><aug><au><snm>Ohshima</snm><fnm>H</fnm></au><au><snm>Sawa</snm><fnm>T</fnm></au><au><snm>Akaike</snm><fnm>T</fnm></au></aug><source>Antioxid Redox Signal</source><pubdate>2006</pubdate><volume>8</volume><fpage>1033</fpage><lpage>1045</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1089/ars.2006.8.1033</pubid><pubid idtype="pmpid" link="fulltext">16771693</pubid></pubidlist></xrefbib></bibl><bibl id="B15"><title><p>Messenger RNA editing in mammals: new members of the APOBEC family seeking roles in the family 
business.</p></title><aug><au><snm>Wedekind</snm><fnm>JE</fnm></au><au><snm>Dance</snm><fnm>GS</fnm></au><au><snm>Sowden</snm><fnm>MP</fnm></au><au><snm>Smith</snm><fnm>HC</fnm></au></aug><source>Trends Genet</source><pubdate>2003</pubdate><volume>19</volume><fpage>207</fpage><lpage>216</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1016/S0168-9525(03)00054-4</pubid><pubid idtype="pmpid" link="fulltext">12683974</pubid></pubidlist></xrefbib></bibl><bibl id="B16"><title><p>Activation-induced cytidine deaminase targets DNA at sites of RNA polymerase II stalling by interaction with Spt5.</p></title><aug><au><snm>Pavri</snm><fnm>R</fnm></au><au><snm>Gazumyan</snm><fnm>A</fnm></au><au><snm>Jankovic</snm><fnm>M</fnm></au><au><snm>Di Virgilio</snm><fnm>M</fnm></au><au><snm>Klein</snm><fnm>I</fnm></au><au><snm>Ansarah-Sobrinho</snm><fnm>C</fnm></au><au><snm>Resch</snm><fnm>W</fnm></au><au><snm>Yamane</snm><fnm>A</fnm></au><au><snm>Reina San-Martin</snm><fnm>B</fnm></au><au><snm>Barreto</snm><fnm>V</fnm></au><au><snm>Nieland</snm><fnm>TJ</fnm></au><au><snm>Root</snm><fnm>DE</fnm></au><au><snm>Casellas</snm><fnm>R</fnm></au><au><snm>Nussenzweig</snm><fnm>MC</fnm></au></aug><source>Cell</source><pubdate>2010</pubdate><volume>143</volume><fpage>122</fpage><lpage>133</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1016/j.cell.2010.09.017</pubid><pubid idtype="pmcid">2993080</pubid><pubid idtype="pmpid" link="fulltext">20887897</pubid></pubidlist></xrefbib></bibl><bibl id="B17"><title><p>Helicobacter pylori-induced activation-induced cytidine deaminase expression and carcinogenesis.</p></title><aug><au><snm>Marusawa</snm><fnm>H</fnm></au><au><snm>Chiba</snm><fnm>T</fnm></au></aug><source>Curr Opin Immunol</source><pubdate>2010</pubdate><volume>22</volume><fpage>442</fpage><lpage>447</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1016/j.coi.2010.06.001</pubid><pubid idtype="pmpid" link="fulltext">20667704</pubid></pubidlist></xrefbib></bibl><bibl 
id="B18"><title><p>RHO-GTPases and cancer.</p></title><aug><au><snm>Sahai</snm><fnm>E</fnm></au><au><snm>Marshall</snm><fnm>CJ</fnm></au></aug><source>Nat Rev Cancer</source><pubdate>2002</pubdate><volume>2</volume><fpage>133</fpage><lpage>142</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1038/nrc725</pubid><pubid idtype="pmpid" link="fulltext">12635176</pubid></pubidlist></xrefbib></bibl><bibl id="B19"><title><p>Expression of seven main Rho family members in gastric carcinoma.</p></title><aug><au><snm>Pan</snm><fnm>Y</fnm></au><au><snm>Bi</snm><fnm>F</fnm></au><au><snm>Liu</snm><fnm>N</fnm></au><au><snm>Xue</snm><fnm>Y</fnm></au><au><snm>Yao</snm><fnm>X</fnm></au><au><snm>Zheng</snm><fnm>Y</fnm></au><au><snm>Fan</snm><fnm>D</fnm></au></aug><source>Biochem Biophys Res Commun</source><pubdate>2004</pubdate><volume>315</volume><fpage>686</fpage><lpage>691</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1016/j.bbrc.2004.01.108</pubid><pubid idtype="pmpid" link="fulltext">14975755</pubid></pubidlist></xrefbib></bibl><bibl id="B20"><title><p>p53 suppresses structural chromosome instability after mitotic arrest in human cells.</p></title><aug><au><snm>Dalton</snm><fnm>WB</fnm></au><au><snm>Yu</snm><fnm>B</fnm></au><au><snm>Yang</snm><fnm>VW</fnm></au></aug><source>Oncogene</source><pubdate>2010</pubdate><volume>29</volume><fpage>1929</fpage><lpage>1940</lpage></bibl><bibl id="B21"><title><p>Screening for aneuploidy in first and second trimesters: is there an optimal paradigm?</p></title><aug><au><snm>Breathnach</snm><fnm>FM</fnm></au><au><snm>Malone</snm><fnm>FD</fnm></au></aug><source>Curr Opin Obstet Gynecol</source><pubdate>2007</pubdate><volume>19</volume><fpage>176</fpage><lpage>182</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1097/GCO.0b013e3280895e00</pubid><pubid idtype="pmpid" link="fulltext">17353686</pubid></pubidlist></xrefbib></bibl><bibl id="B22"><title><p>SelTarbase, a database of human mononucleotide-microsatellite mutations and their potential impact to 
tumorigenesis and immunology.</p></title><aug><au><snm>Woerner</snm><fnm>SM</fnm></au><au><snm>Yuan</snm><fnm>YP</fnm></au><au><snm>Benner</snm><fnm>A</fnm></au><au><snm>Korff</snm><fnm>S</fnm></au><au><snm>von Knebel Doeberitz</snm><fnm>M</fnm></au><au><snm>Bork</snm><fnm>P</fnm></au></aug><source>Nucleic Acids Res</source><pubdate>2010</pubdate><volume>38</volume><fpage>D682</fpage><lpage>689</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1093/nar/gkp839</pubid><pubid idtype="pmcid">2808963</pubid><pubid idtype="pmpid" link="fulltext">19820113</pubid></pubidlist></xrefbib></bibl><bibl id="B23"><title><p>Evidence of selection for clones having genetic inactivation of the activin A type II receptor (ACVR2) gene in gastrointestinal cancers.</p></title><aug><au><snm>Hempen</snm><fnm>PM</fnm></au><au><snm>Zhang</snm><fnm>L</fnm></au><au><snm>Bansal</snm><fnm>RK</fnm></au><au><snm>Iacobuzio-Donahue</snm><fnm>CA</fnm></au><au><snm>Murphy</snm><fnm>KM</fnm></au><au><snm>Maitra</snm><fnm>A</fnm></au><au><snm>Vogelstein</snm><fnm>B</fnm></au><au><snm>Whitehead</snm><fnm>RH</fnm></au><au><snm>Markowitz</snm><fnm>SD</fnm></au><au><snm>Willson</snm><fnm>JK</fnm></au><au><snm>Yeo</snm><fnm>CJ</fnm></au><au><snm>Hruban</snm><fnm>RH</fnm></au><au><snm>Kern</snm><fnm>SE</fnm></au></aug><source>Cancer Res</source><pubdate>2003</pubdate><volume>63</volume><fpage>994</fpage><lpage>999</lpage><xrefbib><pubid idtype="pmpid" link="fulltext">12615714</pubid></xrefbib></bibl><bibl id="B24"><title><p>Loss of activin receptor type 2 protein expression in microsatellite unstable colon 
cancers.</p></title><aug><au><snm>Jung</snm><fnm>B</fnm></au><au><snm>Doctolero</snm><fnm>RT</fnm></au><au><snm>Tajima</snm><fnm>A</fnm></au><au><snm>Nguyen</snm><fnm>AK</fnm></au><au><snm>Keku</snm><fnm>T</fnm></au><au><snm>Sandler</snm><fnm>RS</fnm></au><au><snm>Carethers</snm><fnm>JM</fnm></au></aug><source>Gastroenterology</source><pubdate>2004</pubdate><volume>126</volume><fpage>654</fpage><lpage>659</lpage><xrefbib><pubid idtype="pmpid" link="fulltext">14988818</pubid></xrefbib></bibl><bibl id="B25"><title><p>The Sequence Alignment/Map format and SAMtools.</p></title><aug><au><snm>Li</snm><fnm>H</fnm></au><au><snm>Handsaker</snm><fnm>B</fnm></au><au><snm>Wysoker</snm><fnm>A</fnm></au><au><snm>Fennell</snm><fnm>T</fnm></au><au><snm>Ruan</snm><fnm>J</fnm></au><au><snm>Homer</snm><fnm>N</fnm></au><au><snm>Marth</snm><fnm>G</fnm></au><au><snm>Abecasis</snm><fnm>G</fnm></au><au><snm>Durbin</snm><fnm>R</fnm></au></aug><source>Bioinformatics</source><pubdate>2009</pubdate><volume>25</volume><fpage>2078</fpage><lpage>2079</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1093/bioinformatics/btp352</pubid><pubid idtype="pmcid">2723002</pubid><pubid idtype="pmpid" link="fulltext">19505943</pubid></pubidlist></xrefbib></bibl><bibl id="B26"><title><p>Fast and accurate long-read alignment with Burrows-Wheeler transform.</p></title><aug><au><snm>Li</snm><fnm>H</fnm></au><au><snm>Durbin</snm><fnm>R</fnm></au></aug><source>Bioinformatics</source><pubdate>2010</pubdate><volume>26</volume><fpage>589</fpage><lpage>595</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1093/bioinformatics/btp698</pubid><pubid idtype="pmcid">2828108</pubid><pubid idtype="pmpid" link="fulltext">20080505</pubid></pubidlist></xrefbib></bibl><bibl id="B27"><title><p>Sensitive and accurate detection of copy number variants using read depth of 
coverage.</p></title><aug><au><snm>Yoon</snm><fnm>S</fnm></au><au><snm>Xuan</snm><fnm>Z</fnm></au><au><snm>Makarov</snm><fnm>V</fnm></au><au><snm>Ye</snm><fnm>K</fnm></au><au><snm>Sebat</snm><fnm>J</fnm></au></aug><source>Genome Res</source><pubdate>2009</pubdate><volume>19</volume><fpage>1586</fpage><lpage>1592</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1101/gr.092981.109</pubid><pubid idtype="pmcid">2752127</pubid><pubid idtype="pmpid" link="fulltext">19657104</pubid></pubidlist></xrefbib></bibl><bibl id="B28"><title><p>De novo assembly of human genomes with massively parallel short read sequencing.</p></title><aug><au><snm>Li</snm><fnm>R</fnm></au><au><snm>Zhu</snm><fnm>H</fnm></au><au><snm>Ruan</snm><fnm>J</fnm></au><au><snm>Qian</snm><fnm>W</fnm></au><au><snm>Fang</snm><fnm>X</fnm></au><au><snm>Shi</snm><fnm>Z</fnm></au><au><snm>Li</snm><fnm>Y</fnm></au><au><snm>Li</snm><fnm>S</fnm></au><au><snm>Shan</snm><fnm>G</fnm></au><au><snm>Kristiansen</snm><fnm>K</fnm></au><au><snm>Yang</snm><fnm>H</fnm></au><au><snm>Wang</snm><fnm>J</fnm></au></aug><source>Genome Res</source><pubdate>2010</pubdate><volume>20</volume><fpage>265</fpage><lpage>272</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1101/gr.097261.109</pubid><pubid idtype="pmcid">2813482</pubid><pubid idtype="pmpid" link="fulltext">20019144</pubid></pubidlist></xrefbib></bibl><bibl id="B29"><title><p>Ultrafast and memory-efficient alignment of short DNA sequences to the human genome.</p></title><aug><au><snm>Langmead</snm><fnm>B</fnm></au><au><snm>Trapnell</snm><fnm>C</fnm></au><au><snm>Pop</snm><fnm>M</fnm></au><au><snm>Salzberg</snm><fnm>SL</fnm></au></aug><source>Genome Biol</source><pubdate>2009</pubdate><volume>10</volume><fpage>R25</fpage><xrefbib><pubidlist><pubid idtype="doi">10.1186/gb-2009-10-3-r25</pubid><pubid idtype="pmcid">2690996</pubid><pubid idtype="pmpid" link="fulltext">19261174</pubid></pubidlist></xrefbib></bibl><bibl id="B30"><title><p>Opera: reconstructing optimal genomic 
scaffolds with high-throughput paired-end sequences.</p></title><aug><au><snm>Gao</snm><fnm>S</fnm></au><au><snm>Nagarajan</snm><fnm>N</fnm></au><au><snm>Sung</snm><fnm>WK</fnm></au></aug><source>Res Comput Mol Biol</source><pubdate>2011</pubdate><volume>6577</volume><fpage>437</fpage><lpage>451</lpage><xrefbib><pubid idtype="doi">10.1007/978-3-642-20036-6_40</pubid></xrefbib></bibl><bibl id="B31"><title><p>Versatile and open software for comparing large genomes.</p></title><aug><au><snm>Kurtz</snm><fnm>S</fnm></au><au><snm>Phillippy</snm><fnm>A</fnm></au><au><snm>Delcher</snm><fnm>AL</fnm></au><au><snm>Smoot</snm><fnm>M</fnm></au><au><snm>Shumway</snm><fnm>M</fnm></au><au><snm>Antonescu</snm><fnm>C</fnm></au><au><snm>Salzberg</snm><fnm>SL</fnm></au></aug><source>Genome Biol</source><pubdate>2004</pubdate><volume>5</volume><fpage>R12</fpage><xrefbib><pubidlist><pubid idtype="doi">10.1186/gb-2004-5-2-r12</pubid><pubid idtype="pmcid">395750</pubid><pubid idtype="pmpid" link="fulltext">14759262</pubid></pubidlist></xrefbib></bibl><bibl id="B32"><title><p>SeattleSeq.</p></title><url>http://gvs.gs.washington.edu/SeattleSeqAnnotation/</url></bibl><bibl id="B33"><title><p>SIFT: Predicting amino acid changes that affect protein function.</p></title><aug><au><snm>Ng</snm><fnm>PC</fnm></au><au><snm>Henikoff</snm><fnm>S</fnm></au></aug><source>Nucleic Acids Res</source><pubdate>2003</pubdate><volume>31</volume><fpage>3812</fpage><lpage>3814</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1093/nar/gkg509</pubid><pubid idtype="pmcid">168916</pubid><pubid idtype="pmpid" link="fulltext">12824425</pubid></pubidlist></xrefbib></bibl><bibl id="B34"><title><p>Pathway Interaction Database.</p></title><url>http://pid.nci.nih.gov</url></bibl><bibl id="B35"><title><p>Gene Expression Omnibus: NCBI gene expression and hybridization array data 
repository.</p></title><aug><au><snm>Edgar</snm><fnm>R</fnm></au><au><snm>Domrachev</snm><fnm>M</fnm></au><au><snm>Lash</snm><fnm>AE</fnm></au></aug><source>Nucleic Acids Res</source><pubdate>2002</pubdate><volume>30</volume><fpage>207</fpage><lpage>210</lpage><xrefbib><pubidlist><pubid idtype="doi">10.1093/nar/30.1.207</pubid><pubid idtype="pmcid">99122</pubid><pubid idtype="pmpid" link="fulltext">11752295</pubid></pubidlist></xrefbib></bibl></refgrp>
</bm>
</art>