Converting Entrez XML to GFF3

This past week marks the second time I have tried to access gene annotations for a particular genome of interest, only to quickly lose interest because the data were available only in complicated, extremely verbose, and poorly documented XML or ASN.1 formats. Luckily, the first time this happened (a few months ago), I tried hopping on NCBI’s FTP site and was able to find annotations in a GFF-like tab-delimited format, which I was able to easily convert to GFF3. This past week, however, NCBI’s FTP site (and their help desk, for that matter) was no help for finding usable gene annotations in a tab-delimited format.

I finally decided to buck up, bite the bullet, and write a conversion script myself (if these conversion scripts counted toward advancement, I’d have tenure by now). Most of my experience with XML comes from my web development days, when I primarily used PHP’s SimpleXML library for parsing and processing XML data. I’m sure Perl and C (and probably all the other common languages) have XML-processing libraries that are just fine, but I decided to implement this script in PHP so that I could work in a familiar environment, complete the task as quickly as possible, and get back to my research.

Anyway, below is the latest draft of the conversion script I implemented.

#!/usr/bin/env php

This program takes a single argument on the command line: the path of a file containing Entrez XML-formatted gene annotations.
GFF3-formatted annotations are printed to STDOUT.

// Remove the memory limit; the input file is loaded in full by simplexml_load_file().
ini_set("memory_limit", -1);
// Make a failed assert() neither halt the script nor emit a warning.
assert_options(ASSERT_BAIL, false);
assert_options(ASSERT_WARNING, false);
// Entrez strand names mapped to the strand symbols printed in the GFF3 output.
$strands = array("plus" => "+", "minus" => "-");
// Characters that are percent-encoded inside GFF3 attribute values.
// '%' must come first: str_replace() applies the pairs in order, so encoding
// it later would re-encode the '%' introduced by the other replacements
// (';' -> '%3B' -> '%253B').
$encode_search = array('%', ';', '=', '&', ',');
$encode_replace = array('%25', '%3B', '%3D', '%26', '%2C');

// Require the input path on the command line; without it there is nothing to convert.
if(!isset($argv[1]))
{
  fprintf(STDERR, "Usage: %s <entrez.xml>\n", $argv[0]);
  exit(1);
}
$xmlfile = $argv[1];
$xmldata = simplexml_load_file($xmlfile);
// simplexml_load_file() returns false on failure; stop here with a clear
// message instead of failing later on the xpath() call.
if($xmldata === false)
{
  fprintf(STDERR, "Error: unable to load XML file '%s'\n", $xmlfile);
  exit(1);
}

// One element per Entrezgene record in the document.
$genes = $xmldata->xpath('/Entrezgene-Set/Entrezgene');

/**
 * Abort the script with an error message when a required condition is false.
 *
 * The condition is tested directly rather than through assert(), so the check
 * still runs when PHP assertions are disabled. The message is passed as a
 * printf argument, not as the format string, so '%' characters in it (for
 * example percent-encoded accessions) are printed literally.
 *
 * @param bool   $condition condition that must hold for the script to continue
 * @param string $message   explanation written to STDERR when it does not hold
 */
function assertordie($condition, $message)
{
  if(!$condition)
  {
    fprintf(STDERR, "Assert error: %s\n", $message);
    // Non-zero exit status so callers and pipelines can detect the failure.
    exit(1);
  }
}

// Convert every Entrezgene record into GFF3 gene, mRNA, protein, exon, and CDS
// lines on STDOUT; warnings and assertion failures go to STDERR. Every
// Seq-interval coordinate has 1 added to it before it is printed.
foreach($genes as $gene)
{
  // Gene feature data
  $gene_ui   = $gene->{'Entrezgene_gene-source'}->{'Gene-source'}->{'Gene-source_src-int'};
  $gene_acc  = $gene->{'Entrezgene_gene'}->{'Gene-ref'}->{'Gene-ref_locus'};
  $gene_acc  = str_replace($encode_search, $encode_replace, $gene_acc);
  $gene_desc = $gene->{'Entrezgene_gene'}->{'Gene-ref'}->{'Gene-ref_desc'};
  $gene_desc = str_replace($encode_search, $encode_replace, $gene_desc);
  $gene_comm = $gene->xpath('Entrezgene_locus/Gene-commentary');
  $gene_comm_count = 0;
  if(sizeof($gene_comm) > 1)
  {
    fprintf(STDERR, "Warning: assuming that locus '%s (%s)' contains %d genes\n", $gene_acc, $gene_ui, sizeof($gene_comm));
  }

  foreach($gene_comm as $comm)
  {
    // A locus with several commentaries is treated as several genes; each one
    // gets a numbered suffix so that its ID is unique.
    $comm_ui = $gene_ui;
    if(sizeof($gene_comm) > 1)
    {
      $comm_ui = sprintf("%s.g%d", $gene_ui, $gene_comm_count);
    }
    // Advance the suffix for every commentary, including ones skipped below.
    $gene_comm_count++;

    $gene_seq = $comm->{'Gene-commentary_accession'};
    $gene_intervals = $comm->xpath('Gene-commentary_seqs/Seq-loc/Seq-loc_int/Seq-interval');
    assertordie(sizeof($gene_intervals) <= 1, sprintf("number of intervals for gene '%s (%s)': expected=%s, actual=%d", $gene_acc, $comm_ui, "[0,1]", sizeof($gene_intervals)));
    if(sizeof($gene_intervals) == 0)
    {
      fprintf(STDERR, "Warning: gene '%s (%s)' contains no genomic intervals, assuming it's a deprecated gene, skipping\n", $gene_acc, $comm_ui);
      // Actually skip: the code below dereferences $gene_intervals[0].
      continue;
    }

    $gene_products = $comm->xpath('Gene-commentary_products/Gene-commentary[Gene-commentary_type/@value="mRNA"]');
    if(sizeof($gene_products) < 1)
    {
      fprintf(STDERR, "Warning: gene '%s (%s)' contains no mRNA products, assuming it's a duplicates gene, skipping\n", $gene_acc, $comm_ui);
      continue;
    }
    $strand_attributes = $gene_intervals[0]->{"Seq-interval_strand"}->{"Na-strand"}->attributes();
    $gstrand = $strands[(string)$strand_attributes["value"]];
    $gattributes = sprintf('ID=%s;Name=%s;Note="%s"', $comm_ui, $gene_acc, $gene_desc);
    $gstart = (int)$gene_intervals[0]->{"Seq-interval_from"} + 1;
    $gend   = (int)$gene_intervals[0]->{"Seq-interval_to"} + 1;
    printf("%s\t%s\tgene\t%d\t%d\t.\t%s\t.\t%s\n", $gene_seq, "Entrez", $gstart, $gend, $gstrand, $gattributes);

    // mRNA feature data
    foreach($gene_products as $mrna)
    {
      // mRNA and exon features
      $tui = $mrna->{"Gene-commentary_accession"};
      $exons = $mrna->xpath('Gene-commentary_genomic-coords/Seq-loc/Seq-loc_mix/Seq-loc-mix/Seq-loc/Seq-loc_int/Seq-interval');
      if(sizeof($exons) == 0)
      {
        // No multi-interval location: fall back to a single plain interval.
        $exons = $mrna->xpath('Gene-commentary_genomic-coords/Seq-loc/Seq-loc_int/Seq-interval');
        assertordie(sizeof($exons) == 1, sprintf("number of exons for transcript '%s': expected=%d, actual=%d", $tui, 1, sizeof($exons)));
      }
      $tattributes = sprintf('ID=%s;Parent=%s', $tui, $comm_ui);

      // The mRNA spans from the smallest to the largest exon coordinate. Cast
      // to int so min()/max() compare numbers rather than XML element objects.
      $tcoords = array();
      foreach($exons as $exon)
      {
        $tcoords[] = (int)$exon->{"Seq-interval_from"};
        $tcoords[] = (int)$exon->{"Seq-interval_to"};
      }
      $tstart = min($tcoords) + 1;
      $tend   = max($tcoords) + 1;

      // protein and CDS features
      $transcript_products = $mrna->xpath('Gene-commentary_products/Gene-commentary[Gene-commentary_type/@value="peptide"]');
      assertordie(sizeof($transcript_products) == 1, sprintf("number of products for transcript '%s': expected=%d, actual=%d", $tui, 1, sizeof($transcript_products)));
      $protein = $transcript_products[0];
      $pui = $protein->{"Gene-commentary_accession"};
      $cds_segments = $protein->xpath('Gene-commentary_genomic-coords/Seq-loc/Seq-loc_mix/Seq-loc-mix/Seq-loc/Seq-loc_int/Seq-interval');
      if(sizeof($cds_segments) == 0)
      {
        $cds_segments = $protein->xpath('Gene-commentary_genomic-coords/Seq-loc/Seq-loc_int/Seq-interval');
      }
      // min()/max() below cannot handle an empty list, so fail with a clear message.
      assertordie(sizeof($cds_segments) > 0, sprintf("number of CDS segments for protein '%s': expected>=%d, actual=%d", $pui, 1, sizeof($cds_segments)));
      $pcoords = array();
      foreach($cds_segments as $cds)
      {
        $pcoords[] = (int)$cds->{"Seq-interval_from"};
        $pcoords[] = (int)$cds->{"Seq-interval_to"};
      }
      $pstart = min($pcoords) + 1;
      $pend   = max($pcoords) + 1;
      $pattributes = sprintf("ID=%s;Parent=%s", $pui, $tui);

      // Print out all features
      printf("%s\t%s\tmRNA\t%d\t%d\t.\t%s\t.\t%s\n", $gene_seq, "Entrez", $tstart, $tend, $gstrand, $tattributes);
      printf("%s\t%s\tprotein\t%d\t%d\t.\t%s\t.\t%s\n", $gene_seq, "Entrez", $pstart, $pend, $gstrand, $pattributes);
      foreach($exons as $exon)
      {
        $estart = (int)$exon->{"Seq-interval_from"} + 1;
        $eend   = (int)$exon->{"Seq-interval_to"} + 1;
        printf("%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s\n", $gene_seq, "Entrez", $estart, $eend, $gstrand, $tui);
      }
      foreach($cds_segments as $cds)
      {
        $cstart = (int)$cds->{"Seq-interval_from"} + 1;
        $cend   = (int)$cds->{"Seq-interval_to"} + 1;
        printf("%s\t%s\tCDS\t%d\t%d\t.\t%s\t.\tParent=%s\n", $gene_seq, "Entrez", $cstart, $cend, $gstrand, $tui);
      }
    }
  }
}

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )


Connecting to %s