<?xml version='1.0'?>
<!DOCTYPE art SYSTEM 'http://www.biomedcentral.com/xml/article.dtd'>
<art>
   <ui>gb-2008-9-9-r139</ui>
   <ji>GBJ</ji>
   <fm>
      <dochead>Method</dochead>
      <bibl>
         <title>
            <p>Systematic bioinformatic analysis of expression levels of 17,330 human genes across 9,783 samples from 175 types of healthy and pathological tissues</p>
         </title>
         <aug>
            <au id="A1" ce="yes">
               <snm>Kilpinen</snm>
               <fnm>Sami</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <email>sami.k.kilpinen@helsinki.fi</email>
            </au>
            <au id="A2" ce="yes">
               <snm>Autio</snm>
               <fnm>Reija</fnm>
               <insr iid="I3"/>
               <email>reija.autio@tut.fi</email>
            </au>
            <au id="A3">
               <snm>Ojala</snm>
               <fnm>Kalle</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <email>kalle.ojala@helsinki.fi</email>
            </au>
            <au id="A4">
               <snm>Iljin</snm>
               <fnm>Kristiina</fnm>
               <insr iid="I1"/>
               <email>kristiina.iljin@vtt.fi</email>
            </au>
            <au id="A5">
               <snm>Bucher</snm>
               <fnm>Elmar</fnm>
               <insr iid="I1"/>
               <email>elmar.bucher@vtt.fi</email>
            </au>
            <au id="A6">
               <snm>Sara</snm>
               <fnm>Henri</fnm>
               <insr iid="I1"/>
               <email>ext-henri.sara@vtt.fi</email>
            </au>
            <au id="A7">
               <snm>Pisto</snm>
               <fnm>Tommi</fnm>
               <insr iid="I1"/>
               <email>tommi.pisto@vtt.fi</email>
            </au>
            <au id="A8">
               <snm>Saarela</snm>
               <fnm>Matti</fnm>
               <insr iid="I3"/>
               <email>matti.saarela@tut.fi</email>
            </au>
            <au id="A9">
               <snm>Skotheim</snm>
               <mi>I</mi>
               <fnm>Rolf</fnm>
               <insr iid="I1"/>
               <insr iid="I4"/>
               <email>Rolf.I.Skotheim@rr-research.no</email>
            </au>
            <au id="A10">
               <snm>Bj&#246;rkman</snm>
               <fnm>Mari</fnm>
               <insr iid="I1"/>
               <email>Ext-Mari.Bjorkman@vtt.fi</email>
            </au>
            <au id="A11">
               <snm>Mpindi</snm>
               <fnm>John-Patrick</fnm>
               <insr iid="I1"/>
               <email>Ext-John.Mpindi@vtt.fi</email>
            </au>
            <au id="A12">
               <snm>Haapa-Paananen</snm>
               <fnm>Saija</fnm>
               <insr iid="I1"/>
               <email>Saija.Haapa-Paananen@vtt.fi</email>
            </au>
            <au id="A13">
               <snm>Vainio</snm>
               <fnm>Paula</fnm>
               <insr iid="I1"/>
               <email>Paula.Vainio@vtt.fi</email>
            </au>
            <au id="A14">
               <snm>Edgren</snm>
               <fnm>Henrik</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <email>henrik.edgren@helsinki.fi</email>
            </au>
            <au id="A15">
               <snm>Wolf</snm>
               <fnm>Maija</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <email>maija.wolf@helsinki.fi</email>
            </au>
            <au id="A16">
               <snm>Astola</snm>
               <fnm>Jaakko</fnm>
               <insr iid="I3"/>
               <email>jaakko.astola@tut.fi</email>
            </au>
            <au id="A17">
               <snm>Nees</snm>
               <fnm>Matthias</fnm>
               <insr iid="I1"/>
               <email>matthias.nees@vtt.fi</email>
            </au>
            <au id="A18">
               <snm>Hautaniemi</snm>
               <fnm>Sampsa</fnm>
               <insr iid="I5"/>
               <email>sampsa.hautaniemi@helsinki.fi</email>
            </au>
            <au id="A19" ca="yes">
               <snm>Kallioniemi</snm>
               <fnm>Olli</fnm>
               <insr iid="I1"/>
               <insr iid="I2"/>
               <email>olli.kallioniemi@vtt.fi</email>
            </au>
         </aug>
         <insg>
            <ins id="I1">
               <p>Medical Biotechnology, VTT Technical Research Centre and University of Turku, It&#228;inen pitk&#228;katu 4C, Turku, Finland</p>
            </ins>
            <ins id="I2">
               <p>Institute for Molecular Medicine Finland (FIMM), University of Helsinki, Tukholmankatu 8, Helsinki, Finland</p>
            </ins>
            <ins id="I3">
               <p>Department of Signal Processing, Tampere University of Technology, Korkeakoulunkatu 1, Tampere, Finland</p>
            </ins>
            <ins id="I4">
               <p>Department of Cancer Prevention, Institute for Cancer Research, Rikshospitalet-Radiumhospitalet Medical Centre, Oslo, NO-0310, Norway</p>
            </ins>
            <ins id="I5">
               <p>Computational Systems Biology Laboratory, Institute of Biomedicine and Genome-Scale Biology Research Program, University of Helsinki, Haartmaninkatu 8, Finland</p>
            </ins>
         </insg>
         <source>Genome Biology</source>
         <issn>1465-6906</issn>
         <pubdate>2008</pubdate>
         <volume>9</volume>
         <issue>9</issue>
         <fpage>R139</fpage>
         <url>http://genomebiology.com/2008/9/9/R139</url>
         <xrefbib>
            <pubidlist>
               <pubid idtype="pmpid">18803840</pubid>
               <pubid idtype="doi">10.1186/gb-2008-9-9-r139</pubid>
            </pubidlist>
         </xrefbib>
      </bibl>
      <history>
         <rec>
            <date>
               <day>15</day>
               <month>5</month>
               <year>2008</year>
            </date>
         </rec>
         <revrec>
            <date>
               <day>7</day>
               <month>8</month>
               <year>2008</year>
            </date>
         </revrec>
         <acc>
            <date>
               <day>19</day>
               <month>9</month>
               <year>2008</year>
            </date>
         </acc>
         <pub>
            <date>
               <day>19</day>
               <month>09</month>
               <year>2008</year>
            </date>
         </pub>
      </history>
      <cpyrt>
         <year>2008</year>
         <collab>Kilpinen et al; licensee BioMed Central Ltd.</collab>
         <note>This is an open access article distributed under the terms of the Creative Commons Attribution License (<url>http://creativecommons.org/licenses/by/2.0</url>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited.</note>
      </cpyrt>
      <shorttitle>
         <p>GeneSapiens</p>
      </shorttitle>
      <shortabs>
         <p>A method for the comparison of mRNA expression levels of most human genes across gene expression array experiments, and a database of the results, are presented.</p>
      </shortabs>
      <abs>
         <sec>
            <st>
               <p>Abstract</p>
            </st>
            <p>Our knowledge on tissue- and disease-specific functions of human genes is rather limited and highly context-specific. Here, we have developed a method for the comparison of mRNA expression levels of most human genes across 9,783 Affymetrix gene expression array experiments representing 43 normal human tissue types, 68 cancer types, and 64 other diseases. This database of gene expression patterns in normal human tissues and pathological conditions covers 113 million datapoints and is available from the GeneSapiens website.</p>
         </sec>
      </abs>
   </fm>
   <meta>
      <classifications>
         <classification type="BMC" subtype="man_spc_id" id="30010012">Medicine</classification>
         <classification type="BMC" subtype="man_spc_id" id="30010003">Cancer</classification>
         <classification type="BMC" subtype="man_spc_id" id="30010010">Genome studies</classification>
         <classification type="BMC" subtype="man_spc_id" id="30010002">Bioinformatics</classification>
      </classifications>
   </meta>
   <bdy>
      <sec>
         <st>
            <p>Background</p>
         </st>
         <p>A fundamental challenge in the post-genome era is the identification of the context-specific functions of human genes across healthy and disease tissues. Thousands of gene expression microarray measurements are performed each year by the scientific community and many of the data are made publicly available. In order to make use of this resource, integration of large collections of gene expression data from different tissues and microarray platforms is required. Available datasets, however, are often discordant and challenging to integrate due to the variety of the technologies used. Nevertheless, meta-analyses have already been shown to facilitate the analysis of gene expression across healthy and disease states <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr><abbr bid="B3">3</abbr></abbrgrp>. Due to the use of various microarray platforms in studies, the multiple datasets are typically analyzed separately <abbrgrp><abbr bid="B4">4</abbr><abbr bid="B5">5</abbr><abbr bid="B6">6</abbr><abbr bid="B7">7</abbr><abbr bid="B8">8</abbr><abbr bid="B9">9</abbr></abbrgrp>, for instance, focusing on cancer-normal comparisons within an organ type. Other studies have looked for systematic co-expression patterns between genes across multiple datasets in order to predict functions of genes <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B3">3</abbr><abbr bid="B10">10</abbr><abbr bid="B11">11</abbr><abbr bid="B12">12</abbr><abbr bid="B13">13</abbr><abbr bid="B14">14</abbr><abbr bid="B15">15</abbr></abbrgrp>. While this is useful for the understanding of common shared functions of genes across different organs, highly tissue- or disease-specific gene functions may be missed.</p>
         <p>Here, we describe the development of a database of <it>in silico </it>transcriptomics data that currently integrates 157 separate studies involving 9,783 human specimens, from 43 normal tissue types, 68 cancer types and 64 other disease types. The launch of the database was made possible by the development and validation of a novel method to normalize data arising from different Affymetrix microarray generations. The array data are linked with detailed clinical classifications and endpoints and are available through an interactive web interface designed for exploration by biologists and available at the GeneSapiens website <abbrgrp><abbr bid="B16">16</abbr></abbrgrp>. We demonstrate here the application of the GeneSapiens system to the tissue- and disease-specific expression profiles of human genes one at a time or as gene clusters.</p>
      </sec>
      <sec>
         <st>
            <p>Results and discussion</p>
         </st>
         <sec>
            <st>
               <p>Overview of the <it>in silico </it>transcriptomics data in the GeneSapiens system</p>
            </st>
            <p>The database was constructed from 9,783 CEL files of Affymetrix based gene expression measurements from normal and pathological human <it>in vivo </it>tissues and cells. We selected data from the five most widely used Affymetrix array generations (HG-U95A, HG-U95Av2, HG-U133A, HG-U133B, HG-U133 Plus 2), which were then normalized together. The detailed contents of the database are described in Additional data files 3 and 4. Each sample was systematically manually annotated with detailed information (when available) on sample collection procedures, demographic data, anatomic location, disease type, and clinicopathological details. These integrated data make it possible to generate expression profiles of any gene across 175 human tissue and disease types.</p>
            <p>Custom software was developed to construct the database from the collection of CEL files and manually curated annotations linked to each sample. The software was based upon a Perl wrapper calling several subprograms written in Perl, R <abbrgrp><abbr bid="B17">17</abbr></abbrgrp>, C++ and MySQL and Linux Bash scripts. The subprograms identify unique CEL files by using cyclic redundancy checks, preprocess the files, perform the normalization steps, fetch gene annotations from Ensembl and incorporate the manually made annotation for each sample, create a complete MySQL database and perform the final integrity checks. Visualization and analysis tools were implemented in R <abbrgrp><abbr bid="B17">17</abbr></abbrgrp>, and the processed data are made available through a user-friendly and interactive web site <abbrgrp><abbr bid="B16">16</abbr></abbrgrp>. We also implemented a virtual machine approach, the final result being a hardware-independent and rapidly installable complete operating system optimized for running the GeneSapiens database and web-server for the visualization interface.</p>
         </sec>
         <sec>
            <st>
               <p>Development of the data normalization procedure</p>
            </st>
            <p>We implemented a three-step normalization strategy that consisted of probe-level preprocessing, equalization transformation (Q) and array-generation-based gene centering (AGC). We demonstrate that these steps resulted in data that are comparable across the major Affymetrix array generations.</p>
            <sec>
               <st>
                  <p>Step I: data preprocessing at the probe level</p>
               </st>
               <p>We first used the MAS5.0 method <abbrgrp><abbr bid="B18">18</abbr></abbrgrp> to preprocess raw data in the .CEL files. MAS5.0 is an optimal algorithm for the purpose of analyzing very large datasets <abbrgrp><abbr bid="B19">19</abbr></abbrgrp> as it requires less memory than other widely used methods, and the biological representativity of the MAS5.0 normalized data is well documented <abbrgrp><abbr bid="B19">19</abbr></abbrgrp>. In the three-step normalization approach, the subsequent normalization stages also minimized possible problems generated by the MAS5.0 preprocessing algorithm.</p>
               <p>Importantly, we mapped the probes from each array generation type directly to Ensembl gene IDs by using alternative CDF files (version 10) <abbrgrp><abbr bid="B20">20</abbr></abbrgrp> to avoid inaccuracies generated by the original probeset design of Affymetrix arrays. Therefore, this resulted in the optimal redefinition of the gene specificities of the probes and excluded those probes that, according to the recent genome assembly, mapped to multiple genes or nowhere in the genome.</p>
            </sec>
            <sec>
               <st>
                  <p>Step II: Q normalization</p>
               </st>
               <p>After preprocessing, we performed sample-wise normalization of the entire dataset at the gene level. This was done by equalization transformation <abbrgrp><abbr bid="B21">21</abbr></abbrgrp> (Q), which is similar to the widely used quantile normalization <abbrgrp><abbr bid="B22">22</abbr></abbrgrp> in which the samples are transformed by substituting their values with the means of quantiles in the entire dataset. In the Q procedure, we transformed each sample to follow a normal distribution that was estimated from the log<sub>2</sub>-transformed values of the entire dataset (Additional data file 1). The estimated parameters were a mean of 8 and standard deviation of 2. This step of the sample-wise normalization was necessary to prevent a small number of aberrant samples from dominating the mean values for genes within an array generation used in the AGC correction.</p>
            </sec>
            <sec>
               <st>
                  <p>Step III: array-generation-based gene centering (AGC normalization)</p>
               </st>
               <p>We developed a novel AGC method to avoid the bias caused by the different oligos quantifying the same gene in the different Affymetrix array generations. The AGC method is based on the availability of data, on each array generation, from a large number of samples representing different tissues or diseases. In the AGC method, a correction factor is calculated for each gene in each array generation. These correction factors are then used to normalize the gene expression distributions across the whole database (see Materials and methods for details).</p>
            </sec>
         </sec>
         <sec>
            <st>
               <p>Validating the entire normalization protocol</p>
            </st>
            <p>We validated the AGC method as well as the entire normalization procedure in a number of ways and demonstrated that we had achieved improved comparability of the data across the multiple array generations. First, analysis by multi-dimensional scaling (MDS) showed that samples from 15 normal human tissues tested clustered initially based on the four array generations (Figure <figr fid="F1">1a, b</figr>), but after the AGC procedure, the tissue of origin was the primary driver of the clustering (Figure <figr fid="F1">1c, d</figr>). Second, in K-means clustering of the same data, we showed that the corrected rand index <abbrgrp><abbr bid="B23">23</abbr></abbrgrp> (a measure of the accuracy of the sample segregation into characteristic clusters) for array generations decreased from 0.45 to 0.15 and that of the tissues jumped from 0.22 to 0.92 (Additional data file 5). Third, correlation of data from two large datasets where the same samples had been analyzed on two different array generations improved significantly after the AGC correction (Figure <figr fid="F2">2</figr>), reaching across-generation correlations of 0.9. Finally, and most importantly, we showed that the gene profiles of multiple previously known tissue-specific genes matched exactly with those expected based on literature data. Therefore, we expect poorly known genes to provide similarly informative results on their biological and medical importance. These various validation steps are described in more detail below.</p>
            <fig id="F1">
               <title>
                  <p>Figure 1</p>
               </title>
               <caption>
                  <p>Multidimensional scaling (MDS) of Q normalized data before and after AGC correction</p>
               </caption>
               <text>
                  <p>Multidimensional scaling (MDS) of Q normalized data before and after AGC correction. MDS was performed using 1,137 healthy <it>in vivo </it>samples representing 15 tissue categories with 7,390 genes in common without missing values. Color codes show the array generation of each sample for panels on the left-hand side and the high level anatomical system from which samples originate for panels on the right-hand side. <b>(a, b) </b>Clustering of samples in Q normalized data without AGC correction. (a) Clustering driven dominantly by the array generations, but some biological division can be seen in the form of some division within the large clusters. (b) Several tissue classes are separated into two or more clusters due to the different array generation of origin. <b>(c, d) </b>After QAGC, array generations no longer define clusters (c) but instead tissue types form distinct clusters (d).</p>
               </text>
               <graphic file="gb-2008-9-9-r139-1"/>
            </fig>
            <fig id="F2">
               <title>
                  <p>Figure 2</p>
               </title>
               <caption>
                  <p>Boxplots of correlations between the replicated samples after each step of the data normalization process</p>
               </caption>
               <text>
                  <p>Boxplots of correlations between the replicated samples after each step of the data normalization process. All boxes for which notches do not overlap vertically have significantly (&#945; = 0.05) different median values. On the left is a sample set from 14 human muscle biopsy samples measured with array generations U95Av2 and U133A. The correlations computed based on the QAGC-normalized data are significantly higher when compared to MAS5 and Q methods. On the right, all correlations between 123 leukemia samples are plotted. The samples are from three different array generations U95Av2, U133A, and U133B. The first column illustrates correlations between all replicates together (369 correlation values), and in the other columns the correlations are grouped based on the array generation pairs. When the mean values of the correlations computed with each method were compared, the values in the QAGC data were significantly higher.</p>
               </text>
               <graphic file="gb-2008-9-9-r139-2"/>
            </fig>
            <sec>
               <st>
                  <p>Multi-dimensional scaling analysis</p>
               </st>
               <p>We applied MDS <abbrgrp><abbr bid="B24">24</abbr></abbrgrp> to the data processed by Q normalization alone or after the AGC correction. This was done to compare the variability (that is, noise) caused by the array generation with the biological variability in the data. We evaluated 1,137 healthy tissue samples having 7,390 genes in common without any missing values. The samples represented 15 distinct anatomical locations with more than 20 samples from each site. The samples were measured with four array generations (HG-U133 Plus 2, HG-U133A, HG-U95A and HG-U95Av2). In Q normalized data, only some tissue-associated variation could be observed (Figure <figr fid="F1">1a</figr>), while the clusters were primarily driven by the array generations (Figure <figr fid="F1">1b</figr>). After the AGC step was applied, a major change in the clustering of the samples was seen. Array generations no longer defined clusters (Figure <figr fid="F1">1c</figr>), which were now formed predominantly by the tissue types (Figure <figr fid="F1">1d</figr>). The effect was very striking and defined, for example, a clear cluster of neuronal, muscle, hematological and lung tissues. Even though the MDS in three dimensions gives an illustrative example of the segregation of these 15 tissue types, we do not expect the clusters to be completely separated with MDS and only three dimensions. The main reason is that there is significant biological similarity as well as biological variability within each tissue type (such as multiple overlapping cell types). However, this analysis was not meant to provide a demonstration of complete classification accuracy of human tissues but rather to validate the biological relevance of our data. Taken together, the analysis indicates clear improvement in overall biological relevance of the data after our three-step normalization procedure.</p>
            </sec>
            <sec>
               <st>
                  <p>K-means clustering</p>
               </st>
               <p>We clustered the data before and after normalization with four initial centroids using the median values of each array generation, and again with 15 initial centroids using the median values of each tissue type. This test was done for the specific purpose of comparing the impact of the variation generated by the array generations before and after normalization. We calculated the corrected rand indices <abbrgrp><abbr bid="B23">23</abbr></abbrgrp> for each clustering to see whether the array generations or the tissue types form more accurate clusters. The corrected rand index compares partitions defined by the K-means clustering to the known partitions of the data (for example, partitions by array generation or by tissue type). The index varies between 0 and 1, where one indicates that the partitions are identical and not due to chance, whereas zero indicates that the found partitions would be expected by chance. The corrected rand index for the array generations went down from 0.45 to 0.15 when we applied the AGC normalization, while the corrected rand index for tissues jumped from 0.22 to 0.92. The percentages of samples per array generation and per tissue type segregated to the distinct clusters are given in Additional data file 5.</p>
               <p>We also tested the impact of the Q normalization step by performing the same clustering operations on AGC corrected MAS5 data. In this case, the corrected rand index for array generations was 0.11 and for tissue types 0.84. This result showed that AGC could also significantly improve MAS5 data even without the Q normalization, but that the three consecutive steps provided the optimal ability to distinguish biologically relevant signals.</p>
            </sec>
            <sec>
               <st>
                  <p>Correlations of technical replicates</p>
               </st>
               <p>We then studied the correlations between technical replicates of the same samples analyzed on different Affymetrix array generations. While in itself this does not ensure optimal normalization, such analyses have often been used to compare data from different array generations in previous publications <abbrgrp><abbr bid="B4">4</abbr><abbr bid="B9">9</abbr><abbr bid="B25">25</abbr></abbrgrp>. Thus, we used data from three datasets as a basis for these analyses <abbrgrp><abbr bid="B9">9</abbr><abbr bid="B26">26</abbr><abbr bid="B27">27</abbr></abbrgrp>. We first used data for 14 human muscle biopsy samples from patients with inflammatory myopathies <abbrgrp><abbr bid="B9">9</abbr></abbrgrp>. For these cases, data from hybridizations on both HG-U95Av2 and HG-U133A human arrays were available. The correlation coefficient of each replicate pair was > 0.9 when normalized with the AGC method compared to the correlation of the preprocessed and Q normalized values, which were less than 0.75, a significant difference (Figure <figr fid="F2">2a</figr>). We then utilized a dataset from St Jude Children's Research Hospital <abbrgrp><abbr bid="B26">26</abbr><abbr bid="B27">27</abbr></abbrgrp> of 123 human leukemias, each analyzed with the three array generations: HG-U95Av2, HG-U133A and HG-U133B. The mean value of the correlations computed based on the AGC corrected data was significantly higher, 0.78, than the mean of correlations computed based on pre-processed or Q normalized values, which was 0.5 (Figure <figr fid="F2">2b</figr>). For most comparisons, the Q normalized correlations were also slightly higher than those with pre-processing alone.</p>
               <p>In summary, the validation analyses of the normalization approach (Figures <figr fid="F1">1a&#8211;d</figr>, <figr fid="F2">2a, b</figr>; Additional data file 5) together indicate that, in our three-step data processing procedure, the samples clustered mainly according to array generation until the last AGC correction was applied. After the last AGC step, the biological origin of the samples, and not the array generation, drove the clustering (Figure <figr fid="F1">1d</figr>). Therefore, our <it>in silico </it>transcriptomics data have been integrated across all the array generations to the extent that biological variability caused by the tissue and disease types will exceed the technical noise caused by the array generations. This does not mean that the differences between array generations are non-existent, but they will be smaller than most of the biological differences. The final and most important validation of the method was the demonstration that known tissue- and disease-specific genes generated expected profiles across all tissues and diseases (see examples below), thus validating that technical variation is diminished enough to allow accurate biological findings to be made.</p>
            </sec>
         </sec>
         <sec>
            <st>
               <p>Validating GeneSapiens expression profiles with known tissue-specific genes</p>
            </st>
            <p>To evaluate the biological relevance of gene expression profiles from <it>in silico </it>transcriptomics data, we generated tissue- and disease-wide expression profiles for well-known tissue-specific marker genes. Figure <figr fid="F3">3</figr> provides examples of the GeneSapiens plots for <it>TNNT2</it>, <it>ALPP </it>and <it>MAG</it>. In these plots, all the 9,783 samples are represented along the x-axis in a pre-determined fixed order, first the normal tissues, then cancers and then other diseases. The y-axis reflects the relative level of gene expression after the three-step normalization approach.</p>
            <fig id="F3">
               <title>
                  <p>Figure 3</p>
               </title>
               <caption>
                  <p>Detailed expression profiles of <it>TNNT2</it>, <it>ALPP </it>and <it>MAG</it></p>
               </caption>
               <text>
                  <p>Detailed expression profiles of <it>TNNT2</it>, <it>ALPP </it>and <it>MAG</it>. <b>(a) </b><it>TNNT2 </it>is a clinically used cardiac biomarker and, as expected, it shows heart-specific expression. In addition, it has been shown that <it>TNNT2 </it>has elevated expression in some cases of rhabdomyosarcoma, also visible from the profile. <b>(b) </b><it>ALPP </it>had high expression in placenta and somewhat elevated expression in uterine tumors. Additionally, serous ovarian tumors showed elevated expression when compared to the mucinous ones. <b>(c) </b>Known neuronal marker gene <it>MAG </it>similarly shows an expression profile that was highly central nervous system specific.</p>
               </text>
               <graphic file="gb-2008-9-9-r139-3"/>
            </fig>
            <p>Troponin T (<it>TNNT2</it>) showed highly specific expression in heart tissue, as expected for a clinically used cardiac biomarker <abbrgrp><abbr bid="B28">28</abbr></abbrgrp> (Figure <figr fid="F3">3a</figr>). Heart samples in our database originate from four different array generations and comprise only 0.5% of the samples. Therefore, finding an expected tissue-specific expression profile for these samples demonstrates the performance of the normalization even for such a small proportion of samples measured on multiple array generations. Interestingly, <it>TNNT2 </it>is also rather highly expressed in many rhabdomyosarcomas and some Muellerian ovarian tumors. There is one report in the literature for a single case of rhabdomyosarcoma showing increased Troponin T levels in serum <abbrgrp><abbr bid="B29">29</abbr></abbrgrp>, while our GeneSapiens profile demonstrated that this gene is indeed likely to be upregulated in the two aforementioned tumor types. This demonstrates how GeneSapiens profiles can give additional information even from well-known genes. Expression of placental alkaline phosphatase (PLAP; <it>ALPP</it>) was seen predominantly in healthy placenta (Figure <figr fid="F3">3b</figr>), as expected <abbrgrp><abbr bid="B30">30</abbr></abbrgrp>, but also often in tumors of the uterus and ovary and rarely in some other tumor types. This observation fits well with the known oncodevelopmental nature of PLAP, with ectopic expression being common in various types of cancers, with uterine and ovarian cancers being particularly well defined as PLAP-positive <abbrgrp><abbr bid="B31">31</abbr><abbr bid="B32">32</abbr></abbrgrp>. Finally, <it>MAG</it>, a neuronal cell marker <abbrgrp><abbr bid="B33">33</abbr></abbrgrp>, showed the highest expression in central nervous system, and to a lesser extent in gliomas (Figure <figr fid="F3">3c</figr>), again a GeneSapiens profile that could be expected for this well-known marker gene.</p>
            <p>Additional examples are given in Additional data files 4 and 5, and dozens of known tissue-specific genes or biomarkers can be evaluated through the online tool for exploring tissue- and disease-specific gene expression patterns. For example, <it>KLK3 </it>(PSA) is the best-known prostate-specific gene <abbrgrp><abbr bid="B34">34</abbr></abbrgrp> and its GeneSapiens expression profile (Additional data file 2) showed expression only in normal and cancerous human prostate tissues. GFAP is a glial fibrillary acidic protein and showed the expected <abbrgrp><abbr bid="B35">35</abbr></abbrgrp> high level of expression in normal and pathological tissues from the central nervous system (Additional data file 2). Insulin shows the expected extremely pancreas-specific expression (Additional data file 6). LDHC, a known germ-cell specific marker <abbrgrp><abbr bid="B36">36</abbr></abbrgrp>, showed a strong testis-specific expression profile (Additional data file 6).</p>
            <p>GeneSapiens makes it possible to generate gene expression profiles for 17,330 genes across 175 systematically annotated human tissues in a uniform scale with 2,265 to 9,783 data points per gene. Due to the breadth of the tissue and disease spectrum, this kind of analysis provides novel insights into the biological, medical and clinical associations of genes. Furthermore, the expression levels of a given gene can be compared across all normal tissues and all disease types, not just between specific test and control samples (like normal and tumor tissues from the same organ as is usually done). Figure <figr fid="F4">4a, b</figr> illustrate the power of this global tissue- and disease-wide analysis, displaying the expression profile of the <it>PRAME </it>gene. <it>PRAME </it>(preferentially expressed melanoma antigen) showed high expression in normal testis, but was very highly over-expressed in a large variety of human cancers. <it>PRAME </it>over-expression has been previously described in many cancer forms <abbrgrp><abbr bid="B37">37</abbr></abbrgrp> and is known to function as a dominant repressor of retinoic acid receptor signaling <abbrgrp><abbr bid="B37">37</abbr></abbrgrp>.</p>
            <fig id="F4">
               <title>
                  <p>Figure 4</p>
               </title>
               <caption>
                  <p>Detailed gene expression profile of <it>PRAME</it></p>
               </caption>
               <text>
                  <p>Detailed gene expression profile of <it>PRAME</it>. <b>(a) </b>Body-wide expression profile of the <it>PRAME </it>gene across the database. Each dot represents the expression of <it>PRAME </it>in one sample. Anatomical origins of each sample are marked with colored bars below the gene plot. Sample types having higher than average expression or an outlier expression profile are additionally colored in the figure (legend at the top left corner). The <it>PRAME </it>gene is a highly testis-specific gene in normal samples, but is ectopically expressed across the majority of human cancers. Gene plots like these can easily be used to identify outlier expression profiles, as can be seen for kidney cancer in this case, where only a small fraction of the tumors are <it>PRAME </it>positive. <b>(b) </b>Box plot analysis of the <it>PRAME </it>expression levels across a variety of normal and cancer tissues. The number of samples in each category is shown in parentheses. Normal tissues are shown with green boxes and cancerous ones with red boxes. The box refers to the quartile distribution (25-75%) range, with the median shown as a black horizontal line. In addition, the 95% range and individual outlier samples are shown.</p>
               </text>
               <graphic file="gb-2008-9-9-r139-4"/>
            </fig>
         </sec>
         <sec>
            <st>
               <p>'Body-map' analysis to visualize expression profiles for groups of genes across all tissues and diseases</p>
            </st>
            <p>To illustrate the power of GeneSapiens analysis in the study of gene expression profiles of human cancer genes (as defined by Sanger Center human cancer gene census), we produced a clustered map of the mean expression levels of 342 cancer genes across 110 healthy and malignant human tissues (Figure <figr fid="F5">5</figr>). Clustering along the sample type (y-axis) revealed that based on the expression profiles of these cancer genes, the samples could be divided into three overall classes: solid tumors (84.4% of sample types were malignant in this class), normal tissues (82.1% of sample types were healthy in this class) and hematological samples (100% sample types were normal or malignant hematological samples in this class). Thus, the group of classic cancer genes had distinctly different expression between healthy and malignant solid tissues, but in hematological samples, cancer and normal samples could not be separated.</p>
            <fig id="F5">
               <title>
                  <p>Figure 5</p>
               </title>
               <caption>
                  <p>Body-wide expression map of known cancer genes</p>
               </caption>
               <text>
                  <p>Body-wide expression map of known cancer genes. On the x-axis are 342 genes and on the y-axis are 110 <it>in vivo </it>tissues (both healthy and malignant) from human. The color indicates the mean expression value of each gene in each tissue. Grey color signifies missing values. Values have been gene-wise scaled (mean 0 and standard deviation 1). Both axes have been clustered by using Euclidean distance with complete linkage method. Below the expression map are gene-wise Pearson correlation coefficients with four known cellular process/tissue-specific marker genes (<it>Ki-67</it>, <it>PCNA</it>, <it>KRT19 </it>and <it>PTPRC</it>). Correlations have been calculated over 8,409 healthy and malignant samples using pairwise complete observations. Comparison of highest correlation values and clusters of genes on the expression map confirm that through the analysis of <it>in silico </it>transcriptomics data it is possible to find both tissue specificity and functional associations with processes such as cell cycle. For example, the orange colored branch contains genes having highest correlation with epithelial marker <it>KRT19</it>, branches colored blue contain genes mostly expressed in the hematological system and they also correlate with <it>PTPRC</it>, a marker for hematological tissues. Additionally, genes related to mitosis cluster together (purple branch), having highest correlations with <it>Ki-67 </it>and <it>PCNA</it>. The rectangles (A, B, C) highlight three genes as examples of extreme expression in some cancers (see Figure 6 and Additional data files 7 and 8 for enlargements of these areas).</p>
               </text>
               <graphic file="gb-2008-9-9-r139-5"/>
            </fig>
            <p>Clustering of the cancer genes according to their mean body-wide expression profiles revealed five characteristic subgroups. Expression of <it>MKI67 </it>(Ki-67) <abbrgrp><abbr bid="B38">38</abbr></abbrgrp> and <it>PCNA </it><abbrgrp><abbr bid="B39">39</abbr></abbrgrp> genes, two cell proliferation markers, showed the highest correlations with specific branches of the cancer genes (Figure <figr fid="F5">5</figr>, purple branch). <it>KRT19 </it>(a known epithelial marker) <abbrgrp><abbr bid="B40">40</abbr></abbrgrp> and <it>PTPRC </it>(an established marker for hematopoiesis) <abbrgrp><abbr bid="B41">41</abbr></abbrgrp> revealed a correlation with genes in the orange and blue branches. Genes most highly associated with proliferation markers were clearly the ones with gain of expression in solid malignant tissues. The branch colored red contained enrichments of Gene Ontology classes <abbrgrp><abbr bid="B42">42</abbr><abbr bid="B43">43</abbr></abbrgrp> related to differentiation, cell adhesion and catabolic processes (data not shown), which fits with the tendency for down-regulation of this group of cancer genes in malignant tumors.</p>
            <p>This kind of body-wide expression map of genes can also be used to pinpoint medically interesting associations for individual genes (three examples marked with rectangles and labeled A, B and C). <it>KIT </it>had the highest GeneSapiens expression level in gastrointestinal stromal tumors (GISTs; Figure <figr fid="F5">5</figr>, rectangle A, and Figure <figr fid="F6">6</figr>). <it>KIT </it>is a key therapeutic target of Gleevec in GIST tumors <abbrgrp><abbr bid="B44">44</abbr></abbrgrp>. The body-wide expression profiles of GeneSapiens would have therefore readily identified this association of <it>KIT </it>with GIST samples along with this therapeutic opportunity.</p>
            <fig id="F6">
               <title>
                  <p>Figure 6</p>
               </title>
               <caption>
                  <p>Expression profile for the <it>KIT </it>gene shows interesting patterns in the bodymap in Figure 5</p>
               </caption>
               <text>
                  <p>Expression profile for the <it>KIT </it>gene shows interesting patterns in the bodymap in Figure 5. <it>KIT </it>exhibits extremely high expression in gastrointestinal stromal tumors. <it>KIT </it>is known to be inhibited by Gleevec<sup>&#174;</sup>, demonstrating that findings like these pinpoint immediate possibilities for drug repositioning.</p>
               </text>
               <graphic file="gb-2008-9-9-r139-6"/>
            </fig>
            <p>The second example is <it>FEV </it>(Figure <figr fid="F5">5</figr>, rectangle B, and Additional data file 7), a gene known to have functions in healthy nervous system. This ETS-family transcription factor showed low, but detectable, expression in healthy central nervous system and in prostate. In malignant tissues <it>FEV </it>had highly elevated expression in synovial sarcoma, neuroblastoma, malignant peripheral nerve sheath tumors, and small intestinal adenocarcinoma, and somewhat elevated expression in prostate cancer.</p>
            <p>The third example of cancer gene profiles is <it>C1orf56</it>, also known as <it>AF1Q </it>or <it>MLLT11 </it>(Figure <figr fid="F5">5</figr>, rectangle C, and Additional data file 8). In healthy tissues it was expressed only in the nervous system, but in malignant tissues there was gain of expression in T-cell acute lymphoid leukemia, Ewing sarcoma, lung small cell cancer, and nephroblastoma, and extreme overexpression in neuroblastoma. <it>MLLT11 </it>is known to be fused to the <it>MLL </it>gene in acute leukemias <abbrgrp><abbr bid="B45">45</abbr></abbrgrp>. This raises the possibility that <it>MLLT11 </it>could be a fusion gene target <abbrgrp><abbr bid="B46">46</abbr><abbr bid="B47">47</abbr></abbrgrp> or undergoing activating mutations in a range of tumor types. Alternatively, the high levels of expression in these tumors suggest that this gene is often activated in cancer by other mechanisms.</p>
         </sec>
      </sec>
      <sec>
         <st>
            <p>Conclusion</p>
         </st>
         <p>The major advantage of the GeneSapiens data mining methodology is that it provides an integrated view of human gene expression levels across thousands of samples representing hundreds of different tissue and disease types. GeneSapiens offers unprecedented possibilities to study gene expression levels not only between a particular tumor type and the corresponding normal tissue, but by providing body-wide overviews of gene expression levels across all kinds of normal and disease states. While meta-analysis of microarray data <abbrgrp><abbr bid="B48">48</abbr><abbr bid="B49">49</abbr></abbrgrp> has been previously demonstrated to be powerful in taking advantage of the enormous amounts of publicly available data <abbrgrp><abbr bid="B1">1</abbr><abbr bid="B2">2</abbr><abbr bid="B50">50</abbr></abbrgrp>, most existing methods, such as Oncomine <abbrgrp><abbr bid="B2">2</abbr></abbrgrp> and Genevestigator <abbrgrp><abbr bid="B51">51</abbr></abbrgrp>, are based on the analysis of one study at a time. Others, like the Celsius resource, provide the analysis option on one Affymetrix array generation only, therefore providing data from a more limited spectrum of tissues and diseases. In comparison, GeneSapiens provides insights on 'body- and disease-wide' expression of 17,330 genes in approximately 10,000 human samples. Its value is evidenced by the capturing of much of the known data on biological and medical associations for several tissue-specific marker genes (Figures <figr fid="F3">3</figr>, <figr fid="F4">4</figr>, <figr fid="F5">5</figr>, <figr fid="F6">6</figr>), as well as in providing new insights on even well-studied cancer genes. GeneSapiens is characterized by detailed anatomical, histopathological and clinical annotations of disease states, a critically important feature that is often missing in other more generic gene expression database projects.</p>
         <p>Virtually every gene we have studied in GeneSapiens has had a distinct pattern of expression across the thousands of samples. Hence, GeneSapiens provides systematic biological and medical annotation of individual human genes, which could prove useful even in the case of relatively well-known and abundantly studied cancer genes. For example, the fact that by far the highest levels of <it>KIT </it>expression across all samples available were seen in GISTs demonstrates that one could identify key driver genes that are mutated or otherwise activated in human cancers and could, therefore, be of major therapeutic significance. This high level of overexpression of <it>KIT </it>in GISTs probably reflects the selection pressure favoring the expression of this gene during clonal cancer evolution. GeneSapiens provides the exciting possibility that one could find other previously unknown cancer genes with a similar profile of high expression in one or a few cancer types only that could also turn out to be driven by mutations or translocations <abbrgrp><abbr bid="B47">47</abbr></abbrgrp>. Conversely, even though we will see more and more mutational data being generated from selected human cancers, understanding the impact of the mutations on gene expression will be important. Furthermore, it is extremely useful to be able to characterize the expression of these 'cancer genes' across thousands of cancers and normal tissues of different origins, as sequencing is typically done from a highly selected group of samples. This is illustrated by our analysis of the expression profiles for <it>FEV </it>and <it>C1orf56 </it>(<it>MLLT11</it>). Besides the therapeutic importance, the data on several serum biomarkers of disease, such as Troponin T and PSA, indicate that the body-wide expression profiles of genes could highlight genes with a high specificity to a single organ or disease type, and, therefore, with potential value as serum biomarkers.</p>
         <p>The third important aspect of the GeneSapiens system is the interactive nature of the analysis options that we have generated for making these data publicly available in a user-friendly format. We have set up an interactive website <abbrgrp><abbr bid="B16">16</abbr></abbrgrp> to provide access to the <it>in silico </it>transcriptomics data with detailed expression profiles for 17,330 genes across all the 9,783 annotated healthy and pathological human samples. We provide the possibility to analyze the levels of gene expression across all the tissues and malignant diseases (box-and-whisker plots; Figure <figr fid="F3">3a&#8211;d</figr>), as well as to analyze gene expression at the level of individual samples. The 'GeneSapiens plot' (see, for example, Figure <figr fid="F4">4a</figr>) displays expression levels of the genes in each of the 10,000 samples, arranged in anatomical order and by disease type. The datapoints displayed are interactive and provide links to the specific type of the sample, the histopathological diagnosis and the type of the array generation used. We also provide filtered analysis options where users can explore in detail a particular organ or disease type as well as the option of analyzing the correlation of any two genes across the whole database or subsets of tissues or diseases. Taken together, we believe that the GeneSapiens analysis system provides a highly useful resource to the biomedical research community.</p>
      </sec>
      <sec>
         <st>
            <p>Materials and methods</p>
         </st>
         <sec>
            <st>
               <p>Data collection</p>
            </st>
            <p>This <it>in silico </it>collection of human transcriptomes was constructed by collecting 9,783 publicly available Affymetrix microarray experiments in the form of CEL files as source material. The uniqueness of the collected files was tested with the cyclic redundancy check algorithm (cksum). For a complete listing of the original source data from 157 separate studies, please see Additional data file 3. We combined data from the following Affymetrix generations (HG-U95A, HG-U95Av2, HG-U133A, HG-U133B, HG-U133 Plus 2). Even though HG-U133A and HG-U133B are not different generations, they do have 2,074 common genes, and we considered them as such for the practical purposes of our normalization.</p>
         </sec>
         <sec>
            <st>
               <p>Data preprocessing</p>
            </st>
            <p>Data from all CEL files were pre-processed with the MAS5.0 algorithm <abbrgrp><abbr bid="B18">18</abbr></abbrgrp> with default parameters. Although different opinions exist about optimal preprocessing methods <abbrgrp><abbr bid="B52">52</abbr></abbrgrp>, recent comparison studies indicate that MAS5.0 provides the most faithful cellular network construction <abbrgrp><abbr bid="B53">53</abbr></abbrgrp> and optimal identification of differentially expressed genes <abbrgrp><abbr bid="B54">54</abbr></abbrgrp>. In addition, other preprocessing methods may create false positive results <abbrgrp><abbr bid="B53">53</abbr><abbr bid="B55">55</abbr><abbr bid="B56">56</abbr></abbrgrp>. We used version 10 of the alternative CDF files <abbrgrp><abbr bid="B20">20</abbr></abbrgrp> summarizing the probe level intensities directly to the Ensembl <abbrgrp><abbr bid="B57">57</abbr></abbrgrp> gene IDs (Ensembl build 46). Probes mapping to multiple genes and other problems associated with old generations of Affymetrix probe designs were thereby excluded. Within our normalization process the term pre-processing refers only to steps performed by the MAS5.0 algorithm, and subsequent normalization steps are described below.</p>
         </sec>
         <sec>
            <st>
               <p>Sample-wise normalization with equalization transformation</p>
            </st>
            <p>We utilized equalization transformation (Q) <abbrgrp><abbr bid="B21">21</abbr></abbrgrp>, a method similar to widely used quantile normalization <abbrgrp><abbr bid="B22">22</abbr></abbrgrp>, to normalize the pre-processed data. After Q normalization, the dataset had the desired distribution that has been determined prior to transformation. The normal distribution with mean of 8 and standard deviation 2 (<it>N</it>(<it>8, 4</it>)) was selected as the desired distribution since the distribution of logarithmic, preprocessed values of all samples (N = 9,783) with median 7.92 and standard deviation 2.3 was near to this distribution (Additional data file 1). EQ values were brought to exponential scale to maintain the scale of the original values.</p>
            <p>The quantile normalization <abbrgrp><abbr bid="B22">22</abbr></abbrgrp> would be another choice to perform normalization but has considerable drawbacks in this particular setting. First, it does not perform well when there is variation in the number of genes between samples. This problem is magnified when merging thousands of samples from different array generations. Also, the means of the quantiles may vary substantially when new samples are added to the dataset, whereas the change caused by the equalization transformation is smaller. Quantile normalization is also resource-intensive to compute for thousands of samples with different numbers of measured genes. Thus, equalization transformation (Q) <abbrgrp><abbr bid="B21">21</abbr></abbrgrp> was the method of choice in this study.</p>
         </sec>
         <sec>
            <st>
               <p>Array-generation-based gene centering (AGC)</p>
            </st>
            <p>To be able to compare the samples of <it>in silico </it>transcriptomics also between the array generations, we developed a novel method for gene-wise normalization of the data. In this AGC method we assume that the mean of the expression values for any particular gene in each array generation is the same. If the mean value of some of the array generations differs substantially from the others, the shift is assumed to be caused by the array generation based variation, and the AGC method aims to correct this variation. The AGC method requires that the collection of samples to be analyzed is large enough so that one can assume the distribution of values of each gene <it>k </it>to represent the total distribution of all potential expression values across all tissues for each array generation <it>i</it>. Therefore, the AGC method normalizes the data to have mean values <it>&#956;</it><sub><it>i, k</it></sub><it>= &#956;</it><sub><it>all, k </it></sub>for all array generations <it>i</it>, where <it>&#956;</it><sub><it>all, k </it></sub>is the mean of all values of the gene <it>k</it>. Further, it is assumed that the minimum and the maximum estimates for the gene value are reached and the range of the gene <it>k </it>should approximately be [<it>a</it><sub><it>k</it>, </sub><it>b</it><sub><it>k</it></sub>], where <it>a</it><sub><it>k </it></sub>is the lowest 2% value and <it>b</it><sub><it>k </it></sub>is the largest 2% value of gene <it>k</it>. AGC values should not go over this range. However, if the new centered value exceeds the range, the difference is diminished towards the range limits with coefficient <it>c</it>, 0 &#8804; <it>c </it>&#8804; 1. Here, the coefficient is set to <it>c </it>= 1/5. Coefficient <it>c </it>is necessary to prevent some extremely tissue-specific genes from having arbitrarily large correction factors, which is possible if the specific tissue is absent from one or more array generations. 
The coefficient <it>c </it>affects 2.9% of all correction factors. Of those cases, the proportion of the correction factor modified by coefficient <it>c </it>was, on the average, 7.6%. Thus, the coefficient <it>c </it>affected an extreme minority of the corrections in a significant manner, but nevertheless, it was found to be crucial for the AGC method. The centered values can now be obtained with:</p>
            <p>
               <display-formula>
                  <m:math xmlns:m="http://www.w3.org/1998/Math/MathML" name="gb-2008-9-9-r139-i1">
                     <m:semantics>
                        <m:mrow>
                           <m:msub>
                              <m:mover accent="true">
                                 <m:mi>x</m:mi>
                                 <m:mo>^</m:mo>
                              </m:mover>
                              <m:mrow>
                                 <m:mi>i</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>j</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>k</m:mi>
                              </m:mrow>
                           </m:msub>
                           <m:mo>=</m:mo>
                           <m:msub>
                              <m:mi>x</m:mi>
                              <m:mrow>
                                 <m:mi>i</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>j</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>k</m:mi>
                              </m:mrow>
                           </m:msub>
                           <m:mo>&#8722;</m:mo>
                           <m:mo stretchy="false">(</m:mo>
                           <m:msub>
                              <m:mi>&#956;</m:mi>
                              <m:mrow>
                                 <m:mi>i</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>k</m:mi>
                              </m:mrow>
                           </m:msub>
                           <m:mo>&#8722;</m:mo>
                           <m:msub>
                              <m:mi>&#956;</m:mi>
                              <m:mrow>
                                 <m:mi>a</m:mi>
                                 <m:mi>l</m:mi>
                                 <m:mi>l</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>k</m:mi>
                              </m:mrow>
                           </m:msub>
                           <m:mo stretchy="false">)</m:mo>
                        </m:mrow>
                        <m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaagaart1ev2aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8GiVeY=Pipec8Eeeu0xXdbba9frFj0xb9Lqpepeea0xd9q8qiYRWxGi6xij=hbbc9s8aq0=yqpe0xbbG8A8frFve9Fve9Fj0dmeaabaqaciGacaGaaeqabaqabeGadaaakeaaceWG4bGbaKaadaWgaaWcbaGaamyAaiaacYcacaWGQbGaaiilaiaadUgaaeqaaOGaeyypa0JaamiEamaaBaaaleaacaWGPbGaaiilaiaadQgacaGGSaGaam4AaaqabaGccqGHsislcaGGOaGaeqiVd02aaSbaaSqaaiaadMgacaGGSaGaam4AaaqabaGccqGHsislcqaH8oqBdaWgaaWcbaGaamyyaiaadYgacaWGSbGaaiilaiaadUgaaeqaaOGaaiykaaaa@4A1C@</m:annotation>
                     </m:semantics>
                  </m:math>
               </display-formula>
            </p>
            <p>where <it>x</it><sub><it>i</it>, <it>j</it>, <it>k </it></sub>is the value of gene <it>k </it>in sample <it>j </it>from array generation <it>i</it>, <it>&#956;</it><sub><it>i</it>, <it>k </it></sub>is the mean of the values of gene <it>k </it>across array generation <it>i</it>, and <it>&#956;</it><sub><it>all</it>, <it>k </it></sub>is the mean of the values of gene <it>k </it>across all array generations. Further, the adjusted values are computed based on the equation:</p>
            <p>
               <display-formula>
                  <m:math xmlns:m="http://www.w3.org/1998/Math/MathML" name="gb-2008-9-9-r139-i2">
                     <m:semantics>
                        <m:mrow>
                           <m:msub>
                              <m:mi>y</m:mi>
                              <m:mrow>
                                 <m:mi>i</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>j</m:mi>
                                 <m:mo>,</m:mo>
                                 <m:mi>k</m:mi>
                              </m:mrow>
                           </m:msub>
                           <m:mo>=</m:mo>
                           <m:mrow>
                              <m:mo>{</m:mo>
                              <m:mrow>
                                 <m:mtable columnalign="left">
                                    <m:mtr columnalign="left">
                                       <m:mtd columnalign="left">
                                          <m:mrow>
                                             <m:msub>
                                                <m:mi>b</m:mi>
                                                <m:mi>k</m:mi>
                                             </m:msub>
                                             <m:mo>+</m:mo>
                                             <m:mi>c</m:mi>
                                             <m:mo stretchy="false">(</m:mo>
                                             <m:msub>
                                                <m:mover accent="true">
                                                   <m:mi>x</m:mi>
                                                   <m:mo>^</m:mo>
                                                </m:mover>
                                                <m:mrow>
                                                   <m:mi>i</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>j</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>k</m:mi>
                                                </m:mrow>
                                             </m:msub>
                                             <m:mo>&#8722;</m:mo>
                                             <m:msub>
                                                <m:mi>b</m:mi>
                                                <m:mi>k</m:mi>
                                             </m:msub>
                                             <m:mo stretchy="false">)</m:mo>
                                             <m:mo>,</m:mo>
                                          </m:mrow>
                                       </m:mtd>
                                       <m:mtd columnalign="left">
                                          <m:mrow>
                                             <m:mtext>for&#160;</m:mtext>
                                             <m:msub>
                                                <m:mover accent="true">
                                                   <m:mi>x</m:mi>
                                                   <m:mo>^</m:mo>
                                                </m:mover>
                                                <m:mrow>
                                                   <m:mi>i</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>j</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>k</m:mi>
                                                </m:mrow>
                                             </m:msub>
                                             <m:mo>></m:mo>
                                             <m:msub>
                                                <m:mi>b</m:mi>
                                                <m:mi>k</m:mi>
                                             </m:msub>
                                             <m:mo>,</m:mo>
                                          </m:mrow>
                                       </m:mtd>
                                    </m:mtr>
                                    <m:mtr columnalign="left">
                                       <m:mtd columnalign="left">
                                          <m:mrow>
                                             <m:msub>
                                                <m:mi>a</m:mi>
                                                <m:mi>k</m:mi>
                                             </m:msub>
                                             <m:mo>&#8722;</m:mo>
                                             <m:mi>c</m:mi>
                                             <m:mo stretchy="false">(</m:mo>
                                             <m:msub>
                                                <m:mi>a</m:mi>
                                                <m:mi>k</m:mi>
                                             </m:msub>
                                             <m:mo>&#8722;</m:mo>
                                             <m:msub>
                                                <m:mover accent="true">
                                                   <m:mi>x</m:mi>
                                                   <m:mo>^</m:mo>
                                                </m:mover>
                                                <m:mrow>
                                                   <m:mi>i</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>j</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>k</m:mi>
                                                </m:mrow>
                                             </m:msub>
                                             <m:mo stretchy="false">)</m:mo>
                                             <m:mo>,</m:mo>
                                          </m:mrow>
                                       </m:mtd>
                                       <m:mtd columnalign="left">
                                          <m:mrow>
                                             <m:mtext>for&#160;</m:mtext>
                                             <m:msub>
                                                <m:mover accent="true">
                                                   <m:mi>x</m:mi>
                                                   <m:mo>^</m:mo>
                                                </m:mover>
                                                <m:mrow>
                                                   <m:mi>i</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>j</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>k</m:mi>
                                                </m:mrow>
                                             </m:msub>
                                             <m:mo>&lt;</m:mo>
                                             <m:msub>
                                                <m:mi>a</m:mi>
                                                <m:mi>k</m:mi>
                                             </m:msub>
                                             <m:mo>,</m:mo>
                                          </m:mrow>
                                       </m:mtd>
                                    </m:mtr>
                                    <m:mtr columnalign="left">
                                       <m:mtd columnalign="left">
                                          <m:mrow>
                                             <m:msub>
                                                <m:mover accent="true">
                                                   <m:mi>x</m:mi>
                                                   <m:mo>^</m:mo>
                                                </m:mover>
                                                <m:mrow>
                                                   <m:mi>i</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>j</m:mi>
                                                   <m:mo>,</m:mo>
                                                   <m:mi>k</m:mi>
                                                </m:mrow>
                                             </m:msub>
                                             <m:mo>,</m:mo>
                                          </m:mrow>
                                       </m:mtd>
                                       <m:mtd columnalign="left">
                                          <m:mrow>
                                             <m:mtext>otherwise</m:mtext>
                                             <m:mtext>.</m:mtext>
                                          </m:mrow>
                                       </m:mtd>
                                    </m:mtr>
                                 </m:mtable>
                              </m:mrow>
                           </m:mrow>
                        </m:mrow>
                        <m:annotation encoding="MathType-MTEF">
 MathType@MTEF@5@5@+=feaagaart1ev2aaatCvAUfeBSjuyZL2yd9gzLbvyNv2Caerbhv2BYDwAHbqedmvETj2BSbqee0evGueE0jxyaibaiKI8=vI8GiVeY=Pipec8Eeeu0xXdbba9frFj0xb9Lqpepeea0xd9q8qiYRWxGi6xij=hbbc9s8aq0=yqpe0xbbG8A8frFve9Fve9Fj0dmeaabaqaciGacaGaaeqabaqabeGadaaakeaacaWG5bWaaSbaaSqaaiaadMgacaGGSaGaamOAaiaacYcacaWGRbaabeaakiabg2da9maaceaabaqbaeaabmGaaaqaaiaadkgadaWgaaWcbaGaam4AaaqabaGccqGHRaWkcaWGJbGaaiikaiqadIhagaqcamaaBaaaleaacaWGPbGaaiilaiaadQgacaGGSaGaam4AaaqabaGccqGHsislcaWGIbWaaSbaaSqaaiaadUgaaeqaaOGaaiykaiaacYcaaeaacaqGMbGaae4BaiaabkhacaqGGaGabmiEayaajaWaaSbaaSqaaiaadMgacaGGSaGaamOAaiaacYcacaWGRbaabeaakiabg6da+iaadkgadaWgaaWcbaGaam4AaaqabaGccaGGSaaabaGaamyyamaaBaaaleaacaWGRbaabeaakiabgkHiTiaadogacaGGOaGaamyyamaaBaaaleaacaWGRbaabeaakiabgkHiTiqadIhagaqcamaaBaaaleaacaWGPbGaaiilaiaadQgacaGGSaGaam4AaaqabaGccaGGPaGaaiilaaqaaiaabAgacaqGVbGaaeOCaiaabccaceWG4bGbaKaadaWgaaWcbaGaamyAaiaacYcacaWGQbGaaiilaiaadUgaaeqaaOGaeyipaWJaamyyamaaBaaaleaacaWGRbaabeaakiaacYcaaeaaceWG4bGbaKaadaWgaaWcbaGaamyAaiaacYcacaWGQbGaaiilaiaadUgaaeqaaOGaaiilaaqaaiaab+gacaqG0bGaaeiAaiaabwgacaqGYbGaae4DaiaabMgacaqGZbGaaeyzaiaab6caaaaacaGL7baaaaa@7D06@</m:annotation>
                     </m:semantics>
                  </m:math>
               </display-formula>
            </p>
            <p>The resulting AGC values are now AGCvalue = 2<sup><it>y</it></sup>.</p>
            <p>Some other methods <abbrgrp><abbr bid="B58">58</abbr><abbr bid="B59">59</abbr></abbrgrp> are useful to combine different datasets. However, these are computationally very demanding and probably impractical for datasets comprising almost 10,000 samples. Additionally, the performance of these methods is not validated for integration of multiple datasets.</p>
         </sec>
         <sec>
            <st>
               <p>Sample annotation and manual curation</p>
            </st>
            <p>Annotation of the samples is important to make biological and medical sense of the data. Since not all sources of CEL files come with annotations following the MIAME standards <abbrgrp><abbr bid="B60">60</abbr></abbrgrp>, we performed manual annotation of all the data in the database. Annotation terms linked to each sample were defined by a team of seven biologists and medical doctors. The content of the database in terms of healthy, malignant and other disease samples can be seen in Additional data file 4.</p>
         </sec>
         <sec>
            <st>
               <p>Gene annotation</p>
            </st>
            <p>Gene annotation is based on Ensembl. The database has data for each Ensembl gene, even those not featured on any arrays. Gene data include transcript and protein product information, chromosome name and position (band and nucleotide count), biotype (protein coding, miRNA, ribosomal, and so on), and Hugo and Entrez IDs for each gene. These data were downloaded from the Ensembl web site, using the same Ensembl genome build version (release 46) as that used for the construction of the alternative CDF files that we used <abbrgrp><abbr bid="B20">20</abbr></abbrgrp>.</p>
         </sec>
         <sec>
            <st>
               <p>Multidimensional scaling and clustering accuracy</p>
            </st>
            <p>We utilized classic MDS to reduce the number of dimensions in the data <abbrgrp><abbr bid="B24">24</abbr></abbrgrp>. With MDS, 1,137 samples with 7,390 dimensions (that is, genes) were brought to low-dimensional space so that the distance between each sample pair with these new dimensions is very close to the distance between the original values of the samples. As a distance metric, we used Manhattan distance.</p>
         </sec>
         <sec>
            <st>
               <p>K-means clustering and rand index analysis</p>
            </st>
            <p>K-means clustering was performed with default parameters in R. The initial centroids were given as the median value of each gene in array generations or tissues. The algorithm was allowed to run for a maximum of 100,000 iterations for each clustering. The corrected rand index <abbrgrp><abbr bid="B23">23</abbr></abbrgrp> was calculated in R with the fpc library.</p>
         </sec>
         <sec>
            <st>
               <p>Replicate analysis</p>
            </st>
            <p>Replicate analysis was performed by comparing the correlation coefficients of the logarithmic values of two or three hybridizations from a single biological sample using standard methods of computing the Pearson correlation coefficient. This was done for all samples described in <abbrgrp><abbr bid="B9">9</abbr><abbr bid="B26">26</abbr><abbr bid="B27">27</abbr></abbrgrp>.</p>
         </sec>
         <sec>
            <st>
               <p>Body-wide expression profiles of genes</p>
            </st>
            <p>We visualize the expression profile of a single gene across all human tissues with boxplots and with custom designed body-wide expression plots. In the boxplots, the expression profiles of a single gene are displayed and grouped into healthy samples (green boxes) and malignant samples (red boxes). Both types are in anatomically meaningful order, allowing easy comparison of related tissue types. Numbers of samples in each tissue type are in parentheses.</p>
            <p>Custom designed body-wide expression profiles show the expression pattern of a single gene at the level of individual samples, while their layout allows easy analysis of the biological or medical significance of the profile. The y-axis provides the expression level of the gene and the x-axis contains all samples arranged into a fixed order by the type of the sample (healthy, malignant) and subsequently by the tissue type. Thus, each dot describes the expression level of a particular gene in one sample. The anatomical origin of each sample can be seen from the color bar at the bottom of the image. Tissues expressing the gene at a high level (more than one standard deviation higher than the baseline for that gene or having a group of outlier data points) are colored.</p>
         </sec>
         <sec>
            <st>
               <p>Body-wide gene expression heatmaps for human cancer genes</p>
            </st>
            <p>Body-wide expression maps of genes are generated by hierarchical clustering (Euclidean distance with Ward linkage) of the mean expression profiles of 342 genes across 110 <it>in vivo </it>tissues. The number of samples per tissue type is given in parentheses. Values for each gene are mean-centered at 0 with a standard deviation of 1.</p>
         </sec>
         <sec>
            <st>
               <p>Availability of data</p>
            </st>
            <p>As the <it>in silico </it>transcriptomics data of this project are composed of a custom integration of already public microarray data, we provide a table describing the origins of the data used to construct GeneSapiens (Additional data file 3). We have set up a website <abbrgrp><abbr bid="B16">16</abbr></abbrgrp> to allow browsing of expression profiles of these genes and associated information as well as generation of correlations/scatterplots between any pairs of genes across any tissues.</p>
         </sec>
      </sec>
      <sec>
         <st>
            <p>Abbreviations</p>
         </st>
         <p>AGC: array-generation-based gene centering; GIST: gastrointestinal stromal tumor; MDS: multi-dimensional scaling; PLAP: placental alkaline phosphatase; Q: quantile; QAGC: Q normalized data to which AGC correction has been applied.</p>
      </sec>
      <sec>
         <st>
            <p>Competing interests</p>
         </st>
         <p>The institute has filed a patent application regarding the normalization methodology.</p>
      </sec>
      <sec>
         <st>
            <p>Authors' contributions</p>
         </st>
         <p>SK contributed to the majority of data analysis, database construction and development of normalization and writing of the manuscript. RA and MS contributed to the development and testing of the normalization. KO and EB contributed to data collection and the annotation process. KO also contributed to data mining methods and checking of all annotations. KI had a major contribution to the annotation. SH contributed to the development of normalization and supervised the comparison and validation of the normalization methods. OK supervised the entire project for database construction, data mining and annotation efforts and participated in manuscript writing and editing. The remaining authors contributed towards annotation, data visualization and other methods as well as editing the manuscript.</p>
      </sec>
      <sec>
         <st>
            <p>Additional data files</p>
         </st>
         <p>The following additional data are available with the online version of this paper. Additional data file <supplr sid="S1">1</supplr> shows the distribution of preprocessed datapoints across the entire database (solid line) and normal distribution (<it>N</it>(<it>8, 4</it>)) estimated from it (dashed line). Additional data file <supplr sid="S2">2</supplr> shows boxplots of various known tissue-specific genes. Additional data file <supplr sid="S3">3</supplr> lists the sources for all the raw expression data files used in this study. Additional data file <supplr sid="S4">4</supplr> lists the various healthy tissues, cancers and non-cancer diseases represented by the samples in the database and the amounts of samples in each of these categories. Additional data file <supplr sid="S5">5</supplr> lists rand indices for the different normalizations, and the distribution of array generations and tissues into clusters with Q and QAGC (Q normalized data to which AGC correction has been applied) normalized data. Additional data file <supplr sid="S6">6</supplr> shows boxplots of various known tissue-specific genes. Additional data file <supplr sid="S7">7</supplr> shows that <it>FEV </it>has clearly elevated expression in several malignancies, when compared to any healthy tissue. Most interestingly this ETS-factor family member appears to have slightly elevated expression in prostate cancer when compared to healthy prostate. Additional data file <supplr sid="S8">8</supplr> shows that expression of the <it>C1orf56 </it>gene, also known as <it>AF1Q </it>or <it>MLLT11</it>, shows extreme expression in several cancers, especially in neuroblastoma.</p>
         <suppl id="S1">
            <title>
               <p>Additional data file 1</p>
            </title>
            <caption>
               <p>Distribution of preprocessed datapoints across the entire database and normal distribution (<it>N</it>(<it>8, 4</it>)) estimated from it</p>
            </caption>
            <text>
               <p>Distribution of preprocessed datapoints across the entire database (solid line) and normal distribution (<it>N</it>(<it>8, 4</it>)) estimated from it (dashed line).</p>
            </text>
            <file name="gb-2008-9-9-r139-S1.png">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S2">
            <title>
               <p>Additional data file 2</p>
            </title>
            <caption>
               <p>Boxplots of various known tissue-specific genes</p>
            </caption>
            <text>
               <p><b>(a) </b><it>KLK3 </it>(PSA) is a known prostate-specific gene. This specificity is perfectly shown in its expression profile. <b>(b) </b><it>GFAP</it>, a gene coding for glial fibrillary acidic protein, is known to be expressed in the central nervous system. Its expression profile perfectly confirms this prior knowledge.</p>
            </text>
            <file name="gb-2008-9-9-r139-S2.png">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S3">
            <title>
               <p>Additional data file 3</p>
            </title>
            <caption>
               <p>Sources for all the raw expression data files used in this study</p>
            </caption>
            <text>
               <p>Sources for all the raw expression data files used in this study.</p>
            </text>
            <file name="gb-2008-9-9-r139-S3.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S4">
            <title>
               <p>Additional data file 4</p>
            </title>
            <caption>
               <p>Various healthy tissues, cancers and non-cancer diseases represented by the samples in the database and the amounts of samples in each of these categories</p>
            </caption>
            <text>
               <p>Various healthy tissues, cancers and non-cancer diseases represented by the samples in the database and the amounts of samples in each of these categories.</p>
            </text>
            <file name="gb-2008-9-9-r139-S4.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S5">
            <title>
               <p>Additional data file 5</p>
            </title>
            <caption>
               <p>Rand indices for the different normalizations, and the distribution of array generations and tissues into clusters with Q and QAGC normalized data</p>
            </caption>
            <text>
               <p>Rand indices for the different normalizations, and the distribution of array generations and tissues into clusters with Q and QAGC normalized data.</p>
            </text>
            <file name="gb-2008-9-9-r139-S5.xls">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S6">
            <title>
               <p>Additional data file 6</p>
            </title>
            <caption>
               <p>Boxplots of various known tissue-specific genes</p>
            </caption>
            <text>
               <p><b>(a) </b>Insulin (INS) has pancreas-specific expression, as expected. <b>(b) </b><it>LDHC </it>is a known testis-specific gene and it is expressed above background only in healthy testis.</p>
            </text>
            <file name="gb-2008-9-9-r139-S6.png">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S7">
            <title>
               <p>Additional data file 7</p>
            </title>
            <caption>
               <p><it>FEV </it>has clearly elevated expression in several malignancies, when compared to any healthy tissue</p>
            </caption>
            <text>
               <p>Most interestingly this ETS-factor family member appears to have slightly elevated expression in prostate cancer when compared to healthy prostate.</p>
            </text>
            <file name="gb-2008-9-9-r139-S7.eps">
               <p>Click here for file</p>
            </file>
         </suppl>
         <suppl id="S8">
            <title>
               <p>Additional data file 8</p>
            </title>
            <caption>
               <p>The <it>C1orf56 </it>gene shows extreme expression in several cancers, especially in neuroblastoma</p>
            </caption>
            <text>
               <p>The <it>C1orf56 </it>gene, also known as <it>AF1Q </it>or <it>MLLT11</it>, shows extreme expression in several cancers, especially in neuroblastoma.</p>
            </text>
            <file name="gb-2008-9-9-r139-S8.eps">
               <p>Click here for file</p>
            </file>
         </suppl>
      </sec>
   </bdy>
   <bm>
      <ack>
         <sec>
            <st>
               <p>Acknowledgements</p>
            </st>
            <p>The authors would like to acknowledge all annotators of the database as well as Dr Outi Monni for providing facilities in the Biomedicum Biochip Center. This study was supported by the Marie Curie Canceromics (MEXT-CT-2003-2728) grant from the EU, EU-EPITRON (LSHC-CT-2005-518417), Cancer Organizations of Finland, Sigrid Juselius Foundation, Turku TE-Centre, and Academy of Finland (SysBio research program no. 5207532 and Centres of Excellence funding no. 213502) as well as personal grants from the Emil Aaltonen Foundation, the Foundation of Technology, the Finnish Konkordia Fund and the Foundation for Commercial and Technical Sciences (to RA). The authors would like to thank Kristine Kleivi, Sirkku Pollari, Juha Rantala, Santosh Gupta and Kimmo Jaakkola for their help in annotation of microarray data.</p>
         </sec>
      </ack>
      <refgrp>
         <bibl id="B1">
            <title>
               <p>Coexpression analysis of human genes across many microarray datasets.</p>
            </title>
            <aug>
               <au>
                  <snm>Lee</snm>
                  <fnm>HK</fnm>
               </au>
               <au>
                  <snm>Hsu</snm>
                  <fnm>AK</fnm>
               </au>
               <au>
                  <snm>Sajdak</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Qin</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Pavlidis</snm>
                  <fnm>P</fnm>
               </au>
            </aug>
            <source>Genome Res</source>
            <pubdate>2004</pubdate>
            <volume>14</volume>
            <fpage>1085</fpage>
            <lpage>1094</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">419787</pubid>
                  <pubid idtype="pmpid" link="fulltext">15173114</pubid>
                  <pubid idtype="doi">10.1101/gr.1910904</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B2">
            <title>
               <p>ONCOMINE: a cancer microarray database and integrated data-mining platform.</p>
            </title>
            <aug>
               <au>
                  <snm>Rhodes</snm>
                  <fnm>DR</fnm>
               </au>
               <au>
                  <snm>Yu</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Shanker</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Deshpande</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Varambally</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Ghosh</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Barrette</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Pandey</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Chinnaiyan</snm>
                  <fnm>AM</fnm>
               </au>
            </aug>
            <source>Neoplasia</source>
            <pubdate>2004</pubdate>
            <volume>6</volume>
            <fpage>1</fpage>
            <lpage>6</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1635162</pubid>
                  <pubid idtype="pmpid" link="fulltext">15068665</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B3">
            <title>
               <p>Module networks: identifying regulatory modules and their condition-specific regulators from gene expression data.</p>
            </title>
            <aug>
               <au>
                  <snm>Segal</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Shapira</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Regev</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Pe'er</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Botstein</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Koller</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Friedman</snm>
                  <fnm>N</fnm>
               </au>
            </aug>
            <source>Nat Genet</source>
            <pubdate>2003</pubdate>
            <volume>34</volume>
            <fpage>166</fpage>
            <lpage>176</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/ng1165</pubid>
                  <pubid idtype="pmpid" link="fulltext">12740579</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B4">
            <title>
               <p>Integrating probe-level expression changes across generations of Affymetrix arrays.</p>
            </title>
            <aug>
               <au>
                  <snm>Elo</snm>
                  <fnm>LL</fnm>
               </au>
               <au>
                  <snm>Lahti</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Skottman</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Kylaniemi</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Lahesmaa</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Aittokallio</snm>
                  <fnm>T</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2005</pubdate>
            <volume>33</volume>
            <fpage>e193</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1316121</pubid>
                  <pubid idtype="pmpid" link="fulltext">16356924</pubid>
                  <pubid idtype="doi">10.1093/nar/gni193</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B5">
            <title>
               <p>Multiple-laboratory comparison of microarray platforms[see comment][erratum appears in Nat Methods. 2005 Jun;2(6):477].</p>
            </title>
            <aug>
               <au>
                  <snm>Irizarry</snm>
                  <fnm>RA</fnm>
               </au>
               <au>
                  <snm>Warren</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Spencer</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Kim</snm>
                  <fnm>IF</fnm>
               </au>
               <au>
                  <snm>Biswal</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Frank</snm>
                  <fnm>BC</fnm>
               </au>
               <au>
                  <snm>Gabrielson</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Garcia</snm>
                  <fnm>JG</fnm>
               </au>
               <au>
                  <snm>Geoghegan</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Germino</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Griffin</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Hilmer</snm>
                  <fnm>SC</fnm>
               </au>
               <au>
                  <snm>Hoffman</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Jedlicka</snm>
                  <fnm>AE</fnm>
               </au>
               <au>
                  <snm>Kawasaki</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Martinez-Murillo</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Morsberger</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Lee</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Petersen</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Quackenbush</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Scott</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Wilson</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Yang</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Ye</snm>
                  <fnm>SQ</fnm>
               </au>
               <au>
                  <snm>Yu</snm>
                  <fnm>W</fnm>
               </au>
            </aug>
            <source>Nat Methods</source>
            <pubdate>2005</pubdate>
            <volume>2</volume>
            <fpage>345</fpage>
            <lpage>350</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nmeth756</pubid>
                  <pubid idtype="pmpid" link="fulltext">15846361</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B6">
            <title>
               <p>Are data from different gene expression microarray platforms comparable?</p>
            </title>
            <aug>
               <au>
                  <snm>Jarvinen</snm>
                  <fnm>AK</fnm>
               </au>
               <au>
                  <snm>Hautaniemi</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Edgren</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Auvinen</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Saarela</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Kallioniemi</snm>
                  <fnm>OP</fnm>
               </au>
               <au>
                  <snm>Monni</snm>
                  <fnm>O</fnm>
               </au>
            </aug>
            <source>Genomics</source>
            <pubdate>2004</pubdate>
            <volume>83</volume>
            <fpage>1164</fpage>
            <lpage>1168</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1016/j.ygeno.2004.01.004</pubid>
                  <pubid idtype="pmpid" link="fulltext">15177569</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B7">
            <title>
               <p>Independence and reproducibility across microarray platforms[see comment].</p>
            </title>
            <aug>
               <au>
                  <snm>Larkin</snm>
                  <fnm>JE</fnm>
               </au>
               <au>
                  <snm>Frank</snm>
                  <fnm>BC</fnm>
               </au>
               <au>
                  <snm>Gavras</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Sultana</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Quackenbush</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>Nat Methods</source>
            <pubdate>2005</pubdate>
            <volume>2</volume>
            <fpage>337</fpage>
            <lpage>344</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/nmeth757</pubid>
                  <pubid idtype="pmpid" link="fulltext">15846360</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B8">
            <title>
               <p>Getting the noise out of gene arrays.</p>
            </title>
            <aug>
               <au>
                  <snm>Marshall</snm>
                  <fnm>E</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2004</pubdate>
            <volume>306</volume>
            <fpage>630</fpage>
            <lpage>631</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.306.5696.630</pubid>
                  <pubid idtype="pmpid" link="fulltext">15499004</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B9">
            <title>
               <p>Combining gene expression data from different generations of oligonucleotide arrays.</p>
            </title>
            <aug>
               <au>
                  <snm>Hwang</snm>
                  <fnm>KB</fnm>
               </au>
               <au>
                  <snm>Kong</snm>
                  <fnm>SW</fnm>
               </au>
               <au>
                  <snm>Greenberg</snm>
                  <fnm>SA</fnm>
               </au>
               <au>
                  <snm>Park</snm>
                  <fnm>PJ</fnm>
               </au>
            </aug>
            <source>BMC Bioinformatics</source>
            <pubdate>2004</pubdate>
            <volume>5</volume>
            <fpage>159</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">528726</pubid>
                  <pubid idtype="pmpid" link="fulltext">15504239</pubid>
                  <pubid idtype="doi">10.1186/1471-2105-5-159</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B10">
            <title>
               <p>Cluster analysis and display of genome-wide expression patterns.</p>
            </title>
            <aug>
               <au>
                  <snm>Eisen</snm>
                  <fnm>MB</fnm>
               </au>
               <au>
                  <snm>Spellman</snm>
                  <fnm>PT</fnm>
               </au>
               <au>
                  <snm>Brown</snm>
                  <fnm>PO</fnm>
               </au>
               <au>
                  <snm>Botstein</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>Proc Natl Acad Sci USA</source>
            <pubdate>1998</pubdate>
            <volume>95</volume>
            <fpage>14863</fpage>
            <lpage>14868</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">24541</pubid>
                  <pubid idtype="pmpid" link="fulltext">9843981</pubid>
                  <pubid idtype="doi">10.1073/pnas.95.25.14863</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B11">
            <title>
               <p>Synexpression groups in eukaryotes.</p>
            </title>
            <aug>
               <au>
                  <snm>Niehrs</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Pollet</snm>
                  <fnm>N</fnm>
               </au>
            </aug>
            <source>Nature</source>
            <pubdate>1999</pubdate>
            <volume>402</volume>
            <fpage>483</fpage>
            <lpage>487</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/990025</pubid>
                  <pubid idtype="pmpid" link="fulltext">10591207</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B12">
            <title>
               <p>Mining for regulatory programs in the cancer transcriptome.</p>
            </title>
            <aug>
               <au>
                  <snm>Rhodes</snm>
                  <fnm>DR</fnm>
               </au>
               <au>
                  <snm>Kalyana-Sundaram</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Mahavisno</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Barrette</snm>
                  <fnm>TR</fnm>
               </au>
               <au>
                  <snm>Ghosh</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Chinnaiyan</snm>
                  <fnm>AM</fnm>
               </au>
            </aug>
            <source>Nat Genet</source>
            <pubdate>2005</pubdate>
            <volume>37</volume>
            <fpage>579</fpage>
            <lpage>583</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/ng1578</pubid>
                  <pubid idtype="pmpid" link="fulltext">15920519</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B13">
            <title>
               <p>Genome-wide discovery of transcriptional modules from DNA sequence and gene expression.</p>
            </title>
            <aug>
               <au>
                  <snm>Segal</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Yelensky</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Koller</snm>
                  <fnm>D</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2003</pubdate>
            <volume>19</volume>
            <issue>Suppl 1</issue>
            <fpage>i273</fpage>
            <lpage>i282</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/btg1038</pubid>
                  <pubid idtype="pmpid" link="fulltext">12855470</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B14">
            <title>
               <p>A module map showing conditional activity of expression modules in cancer.</p>
            </title>
            <aug>
               <au>
                  <snm>Segal</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Friedman</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Koller</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Regev</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Nat Genet</source>
            <pubdate>2004</pubdate>
            <volume>36</volume>
            <fpage>1090</fpage>
            <lpage>1098</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/ng1434</pubid>
                  <pubid idtype="pmpid" link="fulltext">15448693</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B15">
            <title>
               <p>The functional landscape of mouse gene expression.</p>
            </title>
            <aug>
               <au>
                  <snm>Zhang</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Morris</snm>
                  <fnm>QD</fnm>
               </au>
               <au>
                  <snm>Chang</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Shai</snm>
                  <fnm>O</fnm>
               </au>
               <au>
                  <snm>Bakowski</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Mitsakakis</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Mohammad</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Robinson</snm>
                  <fnm>MD</fnm>
               </au>
               <au>
                  <snm>Zirngibl</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Somogyi</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Laurin</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Eftekharpour</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Sat</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Grigull</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Pan</snm>
                  <fnm>Q</fnm>
               </au>
               <au>
                  <snm>Peng</snm>
                  <fnm>WT</fnm>
               </au>
               <au>
                  <snm>Krogan</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Greenblatt</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Fehlings</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Kooy</snm>
                  <mnm>van der</mnm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Aubin</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Bruneau</snm>
                  <fnm>BG</fnm>
               </au>
               <au>
                  <snm>Rossant</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Blencowe</snm>
                  <fnm>BJ</fnm>
               </au>
               <au>
                  <snm>Frey</snm>
                  <fnm>BJ</fnm>
               </au>
               <au>
                  <snm>Hughes</snm>
                  <fnm>TR</fnm>
               </au>
            </aug>
            <source>J Biol</source>
            <pubdate>2004</pubdate>
            <volume>3</volume>
            <fpage>21</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">549719</pubid>
                  <pubid idtype="pmpid" link="fulltext">15588312</pubid>
                  <pubid idtype="doi">10.1186/jbiol16</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B16">
            <title>
               <p>GeneSapiens</p>
            </title>
            <url>http://www.genesapiens.org</url>
         </bibl>
         <bibl id="B17">
            <aug>
               <au>
                  <cnm>R Development Core Team</cnm>
               </au>
            </aug>
            <source>R: A Language and Environment for Statistical Computing</source>
            <publisher>Vienna, Austria: R Foundation for Statistical Computing</publisher>
            <pubdate>2007</pubdate>
         </bibl>
         <bibl id="B18">
            <title>
               <p>Statistical Algorithms Description Document</p>
            </title>
            <url>http://www.affymetrix.com/support/technical/whitepapers/</url>
         </bibl>
         <bibl id="B19">
            <title>
               <p>The utility of MAS5 expression summary and detection call algorithms.</p>
            </title>
            <aug>
               <au>
                  <snm>Pepper</snm>
                  <fnm>SD</fnm>
               </au>
               <au>
                  <snm>Saunders</snm>
                  <fnm>EK</fnm>
               </au>
               <au>
                  <snm>Edwards</snm>
                  <fnm>LE</fnm>
               </au>
               <au>
                  <snm>Wilson</snm>
                  <fnm>CL</fnm>
               </au>
               <au>
                  <snm>Miller</snm>
                  <fnm>CJ</fnm>
               </au>
            </aug>
            <source>BMC Bioinformatics</source>
            <pubdate>2007</pubdate>
            <volume>8</volume>
            <fpage>273</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1950098</pubid>
                  <pubid idtype="pmpid" link="fulltext">17663764</pubid>
                  <pubid idtype="doi">10.1186/1471-2105-8-273</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B20">
            <title>
               <p>Evolving gene/transcript definitions significantly alter the interpretation of GeneChip data.</p>
            </title>
            <aug>
               <au>
                  <snm>Dai</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Wang</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Boyd</snm>
                  <fnm>AD</fnm>
               </au>
               <au>
                  <snm>Kostov</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Athey</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Jones</snm>
                  <fnm>EG</fnm>
               </au>
               <au>
                  <snm>Bunney</snm>
                  <fnm>WE</fnm>
               </au>
               <au>
                  <snm>Myers</snm>
                  <fnm>RM</fnm>
               </au>
               <au>
                  <snm>Speed</snm>
                  <fnm>TP</fnm>
               </au>
               <au>
                  <snm>Akil</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Watson</snm>
                  <fnm>SJ</fnm>
               </au>
               <au>
                  <snm>Meng</snm>
                  <fnm>F</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2005</pubdate>
            <volume>33</volume>
            <fpage>e175</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1283542</pubid>
                  <pubid idtype="pmpid" link="fulltext">16284200</pubid>
                  <pubid idtype="doi">10.1093/nar/gni179</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B21">
            <title>
               <p>Effects of Herceptin treatment on global gene expression patterns in HER2-amplified and nonamplified breast cancer cell lines.</p>
            </title>
            <aug>
               <au>
                  <snm>Kauraniemi</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Hautaniemi</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Autio</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Astola</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Monni</snm>
                  <fnm>O</fnm>
               </au>
               <au>
                  <snm>Elkahloun</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Kallioniemi</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Oncogene</source>
            <pubdate>2004</pubdate>
            <volume>23</volume>
            <fpage>1010</fpage>
            <lpage>1013</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/sj.onc.1207200</pubid>
                  <pubid idtype="pmpid" link="fulltext">14647448</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B22">
            <title>
               <p>A comparison of normalization methods for high density oligonucleotide array data based on variance and bias.</p>
            </title>
            <aug>
               <au>
                  <snm>Bolstad</snm>
                  <fnm>BM</fnm>
               </au>
               <au>
                  <snm>Irizarry</snm>
                  <fnm>RA</fnm>
               </au>
               <au>
                  <snm>Astrand</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Speed</snm>
                  <fnm>TP</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2003</pubdate>
            <volume>19</volume>
            <fpage>185</fpage>
            <lpage>193</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/19.2.185</pubid>
                  <pubid idtype="pmpid" link="fulltext">12538238</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B23">
            <title>
               <p>Comparing partitions.</p>
            </title>
            <aug>
               <au>
                  <snm>Hubert</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Arabie</snm>
                  <fnm>P</fnm>
               </au>
            </aug>
            <source>J Classification</source>
            <pubdate>1985</pubdate>
            <volume>2</volume>
            <fpage>193</fpage>
            <lpage>218</lpage>
            <xrefbib>
               <pubid idtype="doi">10.1007/BF01908075</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B24">
            <title>
               <p>Gene expression profiling of alveolar rhabdomyosarcoma with cDNA microarrays.</p>
            </title>
            <aug>
               <au>
                  <snm>Khan</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Simon</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Bittner</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Chen</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Leighton</snm>
                  <fnm>SB</fnm>
               </au>
               <au>
                  <snm>Pohida</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Smith</snm>
                  <fnm>PD</fnm>
               </au>
               <au>
                  <snm>Jiang</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Gooden</snm>
                  <fnm>GC</fnm>
               </au>
               <au>
                  <snm>Trent</snm>
                  <fnm>JM</fnm>
               </au>
               <au>
                  <snm>Meltzer</snm>
                  <fnm>PS</fnm>
               </au>
            </aug>
            <source>Cancer Res</source>
            <pubdate>1998</pubdate>
            <volume>58</volume>
            <fpage>5009</fpage>
            <lpage>5013</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">9823299</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B25">
            <title>
               <p>Transformation of expression intensities across generations of Affymetrix microarrays using sequence matching and regression modeling.</p>
            </title>
            <aug>
               <au>
                  <snm>Bhattacharya</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Mariani</snm>
                  <fnm>TJ</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2005</pubdate>
            <volume>33</volume>
            <fpage>e157</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1258179</pubid>
                  <pubid idtype="pmpid" link="fulltext">16224098</pubid>
                  <pubid idtype="doi">10.1093/nar/gni159</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B26">
            <title>
               <p>Classification of pediatric acute lymphoblastic leukemia by gene expression profiling.</p>
            </title>
            <aug>
               <au>
                  <snm>Ross</snm>
                  <fnm>ME</fnm>
               </au>
               <au>
                  <snm>Zhou</snm>
                  <fnm>X</fnm>
               </au>
               <au>
                  <snm>Song</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Shurtleff</snm>
                  <fnm>SA</fnm>
               </au>
               <au>
                  <snm>Girtman</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Williams</snm>
                  <fnm>WK</fnm>
               </au>
               <au>
                  <snm>Liu</snm>
                  <fnm>HC</fnm>
               </au>
               <au>
                  <snm>Mahfouz</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Raimondi</snm>
                  <fnm>SC</fnm>
               </au>
               <au>
                  <snm>Lenny</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Patel</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Downing</snm>
                  <fnm>JR</fnm>
               </au>
            </aug>
            <source>Blood</source>
            <pubdate>2003</pubdate>
            <volume>102</volume>
            <fpage>2951</fpage>
            <lpage>2959</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1182/blood-2003-01-0338</pubid>
                  <pubid idtype="pmpid" link="fulltext">12730115</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B27">
            <title>
               <p>Classification, subtype discovery, and prediction of outcome in pediatric acute lymphoblastic leukemia by gene expression profiling.</p>
            </title>
            <aug>
               <au>
                  <snm>Yeoh</snm>
                  <fnm>EJ</fnm>
               </au>
               <au>
                  <snm>Ross</snm>
                  <fnm>ME</fnm>
               </au>
               <au>
                  <snm>Shurtleff</snm>
                  <fnm>SA</fnm>
               </au>
               <au>
                  <snm>Williams</snm>
                  <fnm>WK</fnm>
               </au>
               <au>
                  <snm>Patel</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Mahfouz</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Behm</snm>
                  <fnm>FG</fnm>
               </au>
               <au>
                  <snm>Raimondi</snm>
                  <fnm>SC</fnm>
               </au>
               <au>
                  <snm>Relling</snm>
                  <fnm>MV</fnm>
               </au>
               <au>
                  <snm>Patel</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Cheng</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Campana</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Wilkins</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Zhou</snm>
                  <fnm>X</fnm>
               </au>
               <au>
                  <snm>Li</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Liu</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Pui</snm>
                  <fnm>CH</fnm>
               </au>
               <au>
                  <snm>Evans</snm>
                  <fnm>WE</fnm>
               </au>
               <au>
                  <snm>Naeve</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Wong</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Downing</snm>
                  <fnm>JR</fnm>
               </au>
            </aug>
            <source>Cancer Cell</source>
            <pubdate>2002</pubdate>
            <volume>1</volume>
            <fpage>133</fpage>
            <lpage>143</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1016/S1535-6108(02)00032-6</pubid>
                  <pubid idtype="pmpid" link="fulltext">12086872</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B28">
            <title>
               <p>Cardiac troponin T and cardiac troponin I: relative values in short-term risk stratification of patients with acute coronary syndromes. GUSTO-IIa Investigators.</p>
            </title>
            <aug>
               <au>
                  <snm>Christenson</snm>
                  <fnm>RH</fnm>
               </au>
               <au>
                  <snm>Duh</snm>
                  <fnm>SH</fnm>
               </au>
               <au>
                  <snm>Newby</snm>
                  <fnm>LK</fnm>
               </au>
               <au>
                  <snm>Ohman</snm>
                  <fnm>EM</fnm>
               </au>
               <au>
                  <snm>Califf</snm>
                  <fnm>RM</fnm>
               </au>
               <au>
                  <snm>Granger</snm>
                  <fnm>CB</fnm>
               </au>
               <au>
                  <snm>Peck</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Pieper</snm>
                  <fnm>KS</fnm>
               </au>
               <au>
                  <snm>Armstrong</snm>
                  <fnm>PW</fnm>
               </au>
               <au>
                  <snm>Katus</snm>
                  <fnm>HA</fnm>
               </au>
               <au>
                  <snm>Topol</snm>
                  <fnm>EJ</fnm>
               </au>
            </aug>
            <source>Clin Chem</source>
            <pubdate>1998</pubdate>
            <volume>44</volume>
            <fpage>494</fpage>
            <lpage>501</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">9510853</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B29">
            <title>
               <p>Metastatic alveolar rhabdomyosarcoma with increased serum creatine kinase MB and cardiac troponin T and normal cardiac troponin I.</p>
            </title>
            <aug>
               <au>
                  <snm>Isotalo</snm>
                  <fnm>PA</fnm>
               </au>
               <au>
                  <snm>Greenway</snm>
                  <fnm>DC</fnm>
               </au>
               <au>
                  <snm>Donnelly</snm>
                  <fnm>JG</fnm>
               </au>
            </aug>
            <source>Clin Chem</source>
            <pubdate>1999</pubdate>
            <volume>45</volume>
            <fpage>1576</fpage>
            <lpage>1578</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">10471669</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B30">
            <title>
               <p>Differential gene expression in the amnion, chorion, and trophoblast of the human placenta.</p>
            </title>
            <aug>
               <au>
                  <snm>Plouzek</snm>
                  <fnm>CA</fnm>
               </au>
               <au>
                  <snm>Leslie</snm>
                  <fnm>KK</fnm>
               </au>
               <au>
                  <snm>Stephens</snm>
                  <fnm>JK</fnm>
               </au>
               <au>
                  <snm>Chou</snm>
                  <fnm>JY</fnm>
               </au>
            </aug>
            <source>Placenta</source>
            <pubdate>1993</pubdate>
            <volume>14</volume>
            <fpage>277</fpage>
            <lpage>285</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1016/S0143-4004(05)80427-8</pubid>
                  <pubid idtype="pmpid">8367411</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B31">
            <title>
               <p>Placenta-like alkaline phosphatase in gynecological cancers.</p>
            </title>
            <aug>
               <au>
                  <snm>Kellen</snm>
                  <fnm>JA</fnm>
               </au>
               <au>
                  <snm>Bush</snm>
                  <fnm>RS</fnm>
               </au>
               <au>
                  <snm>Malkin</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Cancer Res</source>
            <pubdate>1976</pubdate>
            <volume>36</volume>
            <fpage>269</fpage>
            <lpage>271</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">1248005</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B32">
            <title>
               <p>Serum placental-type alkaline phosphatase activity in women with squamous and glandular malignancies of the reproductive tract.</p>
            </title>
            <aug>
               <au>
                  <snm>Ind</snm>
                  <fnm>TE</fnm>
               </au>
               <au>
                  <snm>Iles</snm>
                  <fnm>RK</fnm>
               </au>
               <au>
                  <snm>Carter</snm>
                  <fnm>PG</fnm>
               </au>
               <au>
                  <snm>Lowe</snm>
                  <fnm>DG</fnm>
               </au>
               <au>
                  <snm>Shepherd</snm>
                  <fnm>JH</fnm>
               </au>
               <au>
                  <snm>Hudson</snm>
                  <fnm>CN</fnm>
               </au>
               <au>
                  <snm>Chard</snm>
                  <fnm>T</fnm>
               </au>
            </aug>
            <source>J Clin Pathol</source>
            <pubdate>1994</pubdate>
            <volume>47</volume>
            <fpage>1035</fpage>
            <lpage>1037</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">503069</pubid>
                  <pubid idtype="pmpid" link="fulltext">7829680</pubid>
                  <pubid idtype="doi">10.1136/jcp.47.11.1035</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B33">
            <title>
               <p>Myelin-associated glycoprotein immunoreactive material: an early neuronal marker of dorsal root ganglion cells during chick development.</p>
            </title>
            <aug>
               <au>
                  <snm>Philippe</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Omlin</snm>
                  <fnm>FX</fnm>
               </au>
               <au>
                  <snm>Droz</snm>
                  <fnm>B</fnm>
               </au>
            </aug>
            <source>Brain Res</source>
            <pubdate>1986</pubdate>
            <volume>392</volume>
            <fpage>275</fpage>
            <lpage>277</lpage>
            <xrefbib>
               <pubid idtype="pmpid">2423198</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B34">
            <title>
               <p>Distribution of 15 human kallikreins in tissues and biological fluids.</p>
            </title>
            <aug>
               <au>
                  <snm>Shaw</snm>
                  <fnm>JL</fnm>
               </au>
               <au>
                  <snm>Diamandis</snm>
                  <fnm>EP</fnm>
               </au>
            </aug>
            <source>Clin Chem</source>
            <pubdate>2007</pubdate>
            <volume>53</volume>
            <fpage>1423</fpage>
            <lpage>1432</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1373/clinchem.2007.088104</pubid>
                  <pubid idtype="pmpid" link="fulltext">17573418</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B35">
            <title>
               <p>GFAP promoter directs astrocyte-specific expression in transgenic mice.</p>
            </title>
            <aug>
               <au>
                  <snm>Brenner</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Kisseberth</snm>
                  <fnm>WC</fnm>
               </au>
               <au>
                  <snm>Su</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Besnard</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Messing</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>J Neurosci</source>
            <pubdate>1994</pubdate>
            <volume>14</volume>
            <fpage>1030</fpage>
            <lpage>1037</lpage>
            <xrefbib>
               <pubid idtype="pmpid" link="fulltext">8120611</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B36">
            <title>
               <p>Cancer/testis antigens and gametogenesis: a review and "brain-storming" session.</p>
            </title>
            <aug>
               <au>
                  <snm>Kalejs</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Erenpreisa</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>Cancer Cell Int</source>
            <pubdate>2005</pubdate>
            <volume>5</volume>
            <fpage>4</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">552320</pubid>
                  <pubid idtype="pmpid" link="fulltext">15715909</pubid>
                  <pubid idtype="doi">10.1186/1475-2867-5-4</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B37">
            <title>
               <p>The human tumor antigen PRAME is a dominant repressor of retinoic acid receptor signaling.</p>
            </title>
            <aug>
               <au>
                  <snm>Epping</snm>
                  <fnm>MT</fnm>
               </au>
               <au>
                  <snm>Wang</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Edel</snm>
                  <fnm>MJ</fnm>
               </au>
               <au>
                  <snm>Carlee</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Hernandez</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Bernards</snm>
                  <fnm>R</fnm>
               </au>
            </aug>
            <source>Cell</source>
            <pubdate>2005</pubdate>
            <volume>122</volume>
            <fpage>835</fpage>
            <lpage>847</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1016/j.cell.2005.07.003</pubid>
                  <pubid idtype="pmpid" link="fulltext">16179254</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B38">
            <title>
               <p>The cell proliferation-associated antigen of antibody Ki-67: a very large, ubiquitous nuclear protein with numerous repeated elements, representing a new kind of cell cycle-maintaining proteins.</p>
            </title>
            <aug>
               <au>
                  <snm>Schluter</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Duchrow</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Wohlenberg</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Becker</snm>
                  <fnm>MH</fnm>
               </au>
               <au>
                  <snm>Key</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Flad</snm>
                  <fnm>HD</fnm>
               </au>
               <au>
                  <snm>Gerdes</snm>
                  <fnm>J</fnm>
               </au>
            </aug>
            <source>J Cell Biol</source>
            <pubdate>1993</pubdate>
            <volume>123</volume>
            <fpage>513</fpage>
            <lpage>522</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2200129</pubid>
                  <pubid idtype="pmpid" link="fulltext">8227122</pubid>
                  <pubid idtype="doi">10.1083/jcb.123.3.513</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B39">
            <title>
               <p>Proliferating cell nuclear antigen (PCNA) as a marker of dysplasia in oral mucosa.</p>
            </title>
            <aug>
               <au>
                  <snm>Martinez-Lara</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Gonzalez-Moles</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Ruiz-Avila</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>Bravo</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Ramos</snm>
                  <fnm>MC</fnm>
               </au>
               <au>
                  <snm>Fernandez-Martinez</snm>
                  <fnm>JA</fnm>
               </au>
            </aug>
            <source>Acta Stomatol Belg</source>
            <pubdate>1996</pubdate>
            <volume>93</volume>
            <fpage>29</fpage>
            <lpage>32</lpage>
            <xrefbib>
               <pubid idtype="pmpid">8986050</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B40">
            <title>
               <p>Significance, detection and markers of disseminated breast cancer cells.</p>
            </title>
            <aug>
               <au>
                  <snm>Lacroix</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Endocr Relat Cancer</source>
            <pubdate>2006</pubdate>
            <volume>13</volume>
            <fpage>1033</fpage>
            <lpage>1067</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1677/ERC-06-0001</pubid>
                  <pubid idtype="pmpid" link="fulltext">17158753</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B41">
            <title>
               <p>Identification of distinct elements of the stromal microenvironment that control human hematopoietic stem/progenitor cell growth and differentiation.</p>
            </title>
            <aug>
               <au>
                  <snm>Aiuti</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Friedrich</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Sieff</snm>
                  <fnm>CA</fnm>
               </au>
               <au>
                  <snm>Gutierrez-Ramos</snm>
                  <fnm>JC</fnm>
               </au>
            </aug>
            <source>Exp Hematol</source>
            <pubdate>1998</pubdate>
            <volume>26</volume>
            <fpage>143</fpage>
            <lpage>157</lpage>
            <xrefbib>
               <pubid idtype="pmpid">9472804</pubid>
            </xrefbib>
         </bibl>
         <bibl id="B42">
            <title>
               <p>DAVID: Database for Annotation, Visualization, and Integrated Discovery.</p>
            </title>
            <aug>
               <au>
                  <snm>Dennis</snm>
                  <fnm>G</fnm>
                  <suf>Jr</suf>
               </au>
               <au>
                  <snm>Sherman</snm>
                  <fnm>BT</fnm>
               </au>
               <au>
                  <snm>Hosack</snm>
                  <fnm>DA</fnm>
               </au>
               <au>
                  <snm>Yang</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Gao</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Lane</snm>
                  <fnm>HC</fnm>
               </au>
               <au>
                  <snm>Lempicki</snm>
                  <fnm>RA</fnm>
               </au>
            </aug>
            <source>Genome Biol</source>
            <pubdate>2003</pubdate>
            <volume>4</volume>
            <fpage>P3</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1186/gb-2003-4-5-p3</pubid>
                  <pubid idtype="pmpid" link="fulltext">12734009</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B43">
            <title>
               <p>Gene ontology: tool for the unification of biology. The Gene Ontology Consortium.</p>
            </title>
            <aug>
               <au>
                  <snm>Ashburner</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Ball</snm>
                  <fnm>CA</fnm>
               </au>
               <au>
                  <snm>Blake</snm>
                  <fnm>JA</fnm>
               </au>
               <au>
                  <snm>Botstein</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Butler</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Cherry</snm>
                  <fnm>JM</fnm>
               </au>
               <au>
                  <snm>Davis</snm>
                  <fnm>AP</fnm>
               </au>
               <au>
                  <snm>Dolinski</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Dwight</snm>
                  <fnm>SS</fnm>
               </au>
               <au>
                  <snm>Eppig</snm>
                  <fnm>JT</fnm>
               </au>
               <au>
                  <snm>Harris</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Hill</snm>
                  <fnm>DP</fnm>
               </au>
               <au>
                  <snm>Issel-Tarver</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Kasarskis</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Lewis</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Matese</snm>
                  <fnm>JC</fnm>
               </au>
               <au>
                  <snm>Richardson</snm>
                  <fnm>JE</fnm>
               </au>
               <au>
                  <snm>Ringwald</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Rubin</snm>
                  <fnm>GM</fnm>
               </au>
               <au>
                  <snm>Sherlock</snm>
                  <fnm>G</fnm>
               </au>
            </aug>
            <source>Nat Genet</source>
            <pubdate>2000</pubdate>
            <volume>25</volume>
            <fpage>25</fpage>
            <lpage>29</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/75556</pubid>
                  <pubid idtype="pmpid" link="fulltext">10802651</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B44">
            <title>
               <p>Efficacy and safety of imatinib mesylate in advanced gastrointestinal stromal tumors.</p>
            </title>
            <aug>
               <au>
                  <snm>Demetri</snm>
                  <fnm>GD</fnm>
               </au>
               <au>
                  <snm>von Mehren</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Blanke</snm>
                  <fnm>CD</fnm>
               </au>
               <au>
                  <snm>Abbeele</snm>
                  <mnm>Van den</mnm>
                  <fnm>AD</fnm>
               </au>
               <au>
                  <snm>Eisenberg</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Roberts</snm>
                  <fnm>PJ</fnm>
               </au>
               <au>
                  <snm>Heinrich</snm>
                  <fnm>MC</fnm>
               </au>
               <au>
                  <snm>Tuveson</snm>
                  <fnm>DA</fnm>
               </au>
               <au>
                  <snm>Singer</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Janicek</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Fletcher</snm>
                  <fnm>JA</fnm>
               </au>
               <au>
                  <snm>Silverman</snm>
                  <fnm>SG</fnm>
               </au>
               <au>
                  <snm>Silberman</snm>
                  <fnm>SL</fnm>
               </au>
               <au>
                  <snm>Capdeville</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Kiese</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Peng</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Dimitrijevic</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Druker</snm>
                  <fnm>BJ</fnm>
               </au>
               <au>
                  <snm>Corless</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Fletcher</snm>
                  <fnm>CD</fnm>
               </au>
               <au>
                  <snm>Joensuu</snm>
                  <fnm>H</fnm>
               </au>
            </aug>
            <source>N Engl J Med</source>
            <pubdate>2002</pubdate>
            <volume>347</volume>
            <fpage>472</fpage>
            <lpage>480</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1056/NEJMoa020461</pubid>
                  <pubid idtype="pmpid" link="fulltext">12181401</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B45">
            <title>
               <p>Elevated expression of the AF1q gene, an MLL fusion partner, is an independent adverse prognostic factor in pediatric acute myeloid leukemia.</p>
            </title>
            <aug>
               <au>
                  <snm>Tse</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Meshinchi</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Alonzo</snm>
                  <fnm>TA</fnm>
               </au>
               <au>
                  <snm>Stirewalt</snm>
                  <fnm>DL</fnm>
               </au>
               <au>
                  <snm>Gerbing</snm>
                  <fnm>RB</fnm>
               </au>
               <au>
                  <snm>Woods</snm>
                  <fnm>WG</fnm>
               </au>
               <au>
                  <snm>Appelbaum</snm>
                  <fnm>FR</fnm>
               </au>
               <au>
                  <snm>Radich</snm>
                  <fnm>JP</fnm>
               </au>
            </aug>
            <source>Blood</source>
            <pubdate>2004</pubdate>
            <volume>104</volume>
            <fpage>3058</fpage>
            <lpage>3063</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1182/blood-2003-12-4347</pubid>
                  <pubid idtype="pmpid" link="fulltext">15217837</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B46">
            <title>
               <p>TMPRSS2 fusions with oncogenic ETS factors in prostate cancer involve unbalanced genomic rearrangements and are associated with HDAC1 and epigenetic reprogramming.</p>
            </title>
            <aug>
               <au>
                  <snm>Iljin</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Wolf</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Edgren</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Gupta</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Kilpinen</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Skotheim</snm>
                  <fnm>RI</fnm>
               </au>
               <au>
                  <snm>Peltola</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Smit</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Verhaegh</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Schalken</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Nees</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Kallioniemi</snm>
                  <fnm>O</fnm>
               </au>
            </aug>
            <source>Cancer Res</source>
            <pubdate>2006</pubdate>
            <volume>66</volume>
            <fpage>10242</fpage>
            <lpage>10246</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1158/0008-5472.CAN-06-1986</pubid>
                  <pubid idtype="pmpid" link="fulltext">17079440</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B47">
            <title>
               <p>Recurrent fusion of TMPRSS2 and ETS transcription factor genes in prostate cancer.</p>
            </title>
            <aug>
               <au>
                  <snm>Tomlins</snm>
                  <fnm>SA</fnm>
               </au>
               <au>
                  <snm>Rhodes</snm>
                  <fnm>DR</fnm>
               </au>
               <au>
                  <snm>Perner</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Dhanasekaran</snm>
                  <fnm>SM</fnm>
               </au>
               <au>
                  <snm>Mehra</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Sun</snm>
                  <fnm>XW</fnm>
               </au>
               <au>
                  <snm>Varambally</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Cao</snm>
                  <fnm>X</fnm>
               </au>
               <au>
                  <snm>Tchinda</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Kuefer</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Lee</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Montie</snm>
                  <fnm>JE</fnm>
               </au>
               <au>
                  <snm>Shah</snm>
                  <fnm>RB</fnm>
               </au>
               <au>
                  <snm>Pienta</snm>
                  <fnm>KJ</fnm>
               </au>
               <au>
                  <snm>Rubin</snm>
                  <fnm>MA</fnm>
               </au>
               <au>
                  <snm>Chinnaiyan</snm>
                  <fnm>AM</fnm>
               </au>
            </aug>
            <source>Science</source>
            <pubdate>2005</pubdate>
            <volume>310</volume>
            <fpage>644</fpage>
            <lpage>648</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1126/science.1117679</pubid>
                  <pubid idtype="pmpid" link="fulltext">16254181</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B48">
            <title>
               <p>ArrayExpress - a public repository for microarray gene expression data at the EBI.</p>
            </title>
            <aug>
               <au>
                  <snm>Brazma</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Parkinson</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Sarkans</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Shojatalab</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Vilo</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Abeygunawardena</snm>
                  <fnm>N</fnm>
               </au>
               <au>
                  <snm>Holloway</snm>
                  <fnm>E</fnm>
               </au>
               <au>
                  <snm>Kapushesky</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Kemmeren</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Lara</snm>
                  <fnm>GG</fnm>
               </au>
               <au>
                  <snm>Oezcimen</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Rocca-Serra</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Sansone</snm>
                  <fnm>SA</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2003</pubdate>
            <volume>31</volume>
            <fpage>68</fpage>
            <lpage>71</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">165538</pubid>
                  <pubid idtype="pmpid" link="fulltext">12519949</pubid>
                  <pubid idtype="doi">10.1093/nar/gkg091</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B49">
            <title>
               <p>Gene Expression Omnibus: NCBI gene expression and hybridization array data repository.</p>
            </title>
            <aug>
               <au>
                  <snm>Edgar</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Domrachev</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Lash</snm>
                  <fnm>AE</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2002</pubdate>
            <volume>30</volume>
            <fpage>207</fpage>
            <lpage>210</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">99122</pubid>
                  <pubid idtype="pmpid" link="fulltext">11752295</pubid>
                  <pubid idtype="doi">10.1093/nar/30.1.207</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B50">
            <title>
               <p>Celsius: a community resource for Affymetrix microarray data.</p>
            </title>
            <aug>
               <au>
                  <snm>Day</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Carlson</snm>
                  <fnm>MR</fnm>
               </au>
               <au>
                  <snm>Dong</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>O'Connor</snm>
                  <fnm>BD</fnm>
               </au>
               <au>
                  <snm>Nelson</snm>
                  <fnm>SF</fnm>
               </au>
            </aug>
            <source>Genome Biol</source>
            <pubdate>2007</pubdate>
            <volume>8</volume>
            <fpage>R112</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">2394754</pubid>
                  <pubid idtype="pmpid" link="fulltext">17570842</pubid>
                  <pubid idtype="doi">10.1186/gb-2007-8-6-r112</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B51">
            <title>
               <p>GENEVESTIGATOR. Arabidopsis microarray database and analysis toolbox.</p>
            </title>
            <aug>
               <au>
                  <snm>Zimmermann</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Hirsch-Hoffmann</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Hennig</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Gruissem</snm>
                  <fnm>W</fnm>
               </au>
            </aug>
            <source>Plant Physiol</source>
            <pubdate>2004</pubdate>
            <volume>136</volume>
            <fpage>2621</fpage>
            <lpage>2632</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">523327</pubid>
                  <pubid idtype="pmpid" link="fulltext">15375207</pubid>
                  <pubid idtype="doi">10.1104/pp.104.046367</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B52">
            <title>
               <p>Probe set algorithms: is there a rational best bet?</p>
            </title>
            <aug>
               <au>
                  <snm>Seo</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Hoffman</snm>
                  <fnm>EP</fnm>
               </au>
            </aug>
            <source>BMC Bioinformatics</source>
            <pubdate>2006</pubdate>
            <volume>7</volume>
            <fpage>395</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1569879</pubid>
                  <pubid idtype="pmpid" link="fulltext">16942624</pubid>
                  <pubid idtype="doi">10.1186/1471-2105-7-395</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B53">
            <title>
               <p>Comparative analysis of microarray normalization procedures: effects on reverse engineering gene networks.</p>
            </title>
            <aug>
               <au>
                  <snm>Lim</snm>
                  <fnm>WK</fnm>
               </au>
               <au>
                  <snm>Wang</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Lefebvre</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Califano</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2007</pubdate>
            <volume>23</volume>
            <fpage>i282</fpage>
            <lpage>i288</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/btm201</pubid>
                  <pubid idtype="pmpid" link="fulltext">17646307</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B54">
            <title>
               <p>Preferred analysis methods for Affymetrix GeneChips revealed by a wholly defined control dataset.</p>
            </title>
            <aug>
               <au>
                  <snm>Choe</snm>
                  <fnm>SE</fnm>
               </au>
               <au>
                  <snm>Boutros</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Michelson</snm>
                  <fnm>AM</fnm>
               </au>
               <au>
                  <snm>Church</snm>
                  <fnm>GM</fnm>
               </au>
               <au>
                  <snm>Halfon</snm>
                  <fnm>MS</fnm>
               </au>
            </aug>
            <source>Genome Biol</source>
            <pubdate>2005</pubdate>
            <volume>6</volume>
            <fpage>R16</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">551536</pubid>
                  <pubid idtype="pmpid" link="fulltext">15693945</pubid>
                  <pubid idtype="doi">10.1186/gb-2005-6-2-r16</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B55">
            <title>
               <p>Comparison of algorithms for the analysis of Affymetrix microarray data as evaluated by co-expression of genes in known operons.</p>
            </title>
            <aug>
               <au>
                  <snm>Harr</snm>
                  <fnm>B</fnm>
               </au>
               <au>
                  <snm>Schlotterer</snm>
                  <fnm>C</fnm>
               </au>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2006</pubdate>
            <volume>34</volume>
            <fpage>e8</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1345700</pubid>
                  <pubid idtype="pmpid" link="fulltext">16432259</pubid>
                  <pubid idtype="doi">10.1093/nar/gnj010</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B56">
            <title>
               <p>Correlation test to assess low-level processing of high-density oligonucleotide microarray data.</p>
            </title>
            <aug>
               <au>
                  <snm>Ploner</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Miller</snm>
                  <fnm>LD</fnm>
               </au>
               <au>
                  <snm>Hall</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Bergh</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Pawitan</snm>
                  <fnm>Y</fnm>
               </au>
            </aug>
            <source>BMC Bioinformatics</source>
            <pubdate>2005</pubdate>
            <volume>6</volume>
            <fpage>80</fpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">1084343</pubid>
                  <pubid idtype="pmpid" link="fulltext">15799785</pubid>
                  <pubid idtype="doi">10.1186/1471-2105-6-80</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B57">
            <title>
               <p>Ensembl 2005.</p>
            </title>
            <aug>
               <au>
                  <snm>Hubbard</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Andrews</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Caccamo</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Cameron</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Chen</snm>
                  <fnm>Y</fnm>
               </au>
               <au>
                  <snm>Clamp</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Clarke</snm>
                  <fnm>L</fnm>
               </au>
               <au>
                  <snm>Coates</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Cox</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Cunningham</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>Curwen</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Cutts</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Down</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Durbin</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Fernandez-Suarez</snm>
                  <fnm>XM</fnm>
               </au>
               <au>
                  <snm>Gilbert</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Hammond</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Herrero</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Hotz</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Howe</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Iyer</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Jekosch</snm>
                  <fnm>K</fnm>
               </au>
               <au>
                  <snm>Kahari</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Kasprzyk</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Keefe</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Keenan</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Kokocinski</snm>
                  <fnm>F</fnm>
               </au>
               <au>
                  <snm>London</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Longden</snm>
                  <fnm>I</fnm>
               </au>
               <au>
                  <snm>McVicker</snm>
                  <fnm>G</fnm>
               </au>
               <etal/>
            </aug>
            <source>Nucleic Acids Res</source>
            <pubdate>2005</pubdate>
            <volume>33</volume>
            <fpage>D447</fpage>
            <lpage>453</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="pmcid">540092</pubid>
                  <pubid idtype="pmpid" link="fulltext">15608235</pubid>
                  <pubid idtype="doi">10.1093/nar/gki138</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B58">
            <title>
               <p>Adjustment of systematic microarray data biases.</p>
            </title>
            <aug>
               <au>
                  <snm>Benito</snm>
                  <fnm>M</fnm>
               </au>
               <au>
                  <snm>Parker</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Du</snm>
                  <fnm>Q</fnm>
               </au>
               <au>
                  <snm>Wu</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Xiang</snm>
                  <fnm>D</fnm>
               </au>
               <au>
                  <snm>Perou</snm>
                  <fnm>CM</fnm>
               </au>
               <au>
                  <snm>Marron</snm>
                  <fnm>JS</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2004</pubdate>
            <volume>20</volume>
            <fpage>105</fpage>
            <lpage>114</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/btg385</pubid>
                  <pubid idtype="pmpid" link="fulltext">14693816</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B59">
            <title>
               <p>Fusing microarray experiments with multivariate regression.</p>
            </title>
            <aug>
               <au>
                  <snm>Gilks</snm>
                  <fnm>WR</fnm>
               </au>
               <au>
                  <snm>Tom</snm>
                  <fnm>BD</fnm>
               </au>
               <au>
                  <snm>Brazma</snm>
                  <fnm>A</fnm>
               </au>
            </aug>
            <source>Bioinformatics</source>
            <pubdate>2005</pubdate>
            <volume>21</volume>
            <issue>Suppl 2</issue>
            <fpage>ii137</fpage>
            <lpage>143</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1093/bioinformatics/bti1123</pubid>
                  <pubid idtype="pmpid" link="fulltext">16204093</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
         <bibl id="B60">
            <title>
               <p>Minimum information about a microarray experiment (MIAME)-toward standards for microarray data.</p>
            </title>
            <aug>
               <au>
                  <snm>Brazma</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Hingamp</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Quackenbush</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Sherlock</snm>
                  <fnm>G</fnm>
               </au>
               <au>
                  <snm>Spellman</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Stoeckert</snm>
                  <fnm>C</fnm>
               </au>
               <au>
                  <snm>Aach</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Ansorge</snm>
                  <fnm>W</fnm>
               </au>
               <au>
                  <snm>Ball</snm>
                  <fnm>CA</fnm>
               </au>
               <au>
                  <snm>Causton</snm>
                  <fnm>HC</fnm>
               </au>
               <au>
                  <snm>Gaasterland</snm>
                  <fnm>T</fnm>
               </au>
               <au>
                  <snm>Glenisson</snm>
                  <fnm>P</fnm>
               </au>
               <au>
                  <snm>Holstege</snm>
                  <fnm>FC</fnm>
               </au>
               <au>
                  <snm>Kim</snm>
                  <fnm>IF</fnm>
               </au>
               <au>
                  <snm>Markowitz</snm>
                  <fnm>V</fnm>
               </au>
               <au>
                  <snm>Matese</snm>
                  <fnm>JC</fnm>
               </au>
               <au>
                  <snm>Parkinson</snm>
                  <fnm>H</fnm>
               </au>
               <au>
                  <snm>Robinson</snm>
                  <fnm>A</fnm>
               </au>
               <au>
                  <snm>Sarkans</snm>
                  <fnm>U</fnm>
               </au>
               <au>
                  <snm>Schulze-Kremer</snm>
                  <fnm>S</fnm>
               </au>
               <au>
                  <snm>Stewart</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Taylor</snm>
                  <fnm>R</fnm>
               </au>
               <au>
                  <snm>Vilo</snm>
                  <fnm>J</fnm>
               </au>
               <au>
                  <snm>Vingron</snm>
                  <fnm>M</fnm>
               </au>
            </aug>
            <source>Nat Genet</source>
            <pubdate>2001</pubdate>
            <volume>29</volume>
            <fpage>365</fpage>
            <lpage>371</lpage>
            <xrefbib>
               <pubidlist>
                  <pubid idtype="doi">10.1038/ng1201-365</pubid>
                  <pubid idtype="pmpid" link="fulltext">11726920</pubid>
               </pubidlist>
            </xrefbib>
         </bibl>
      </refgrp>
   </bm>
</art>
