#!/usr/bin/perl
use strict;
use warnings;

##
## Count the number of pseudogenes for each query protein.
##
## Inputs (hard-coded paths below):
##   * a tab-separated Swiss-Prot/TrEMBL summary table, used to annotate
##     each accession (primary and secondary) with id, names, etc.;
##   * a tab-separated "special protein" table giving two comment columns
##     per accession;
##   * two tab-separated pseudogene tables (PSSD1 and PSSD2) whose 8th
##     column is the query protein accession.
## Output:
##   * mouse.count.PSSD1.tbl - one row per query with at least one PSSD1
##     pseudogene, sorted by descending count.
##
## NOTE(review): the copy of this script under review had lost its
## <HANDLE> readline operators; they are restored here from context.
## Confirm that the summary and .gff tables really start with exactly one
## line to be discarded, and that the special-protein table has none.
##

# accession => { pri_ac, sprot_id, pro_leng, sec_ac, pro_name, gene_name,
#                mim, kw, comment, comment2 }
my %sprot_hash;
{
    my $infile =
      "/nfs/bh1/zl/Swissprot/May31-2003/mouse/all.mouse.sptrembl.summary.tbl";
    open( my $in, '<', $infile ) or die "cannot open $infile: $!\n";

    # The first line is read and thrown away (presumably a column header).
    my $skip = <$in>;

    # Test definedness of the read, not the return value of chomp: chomp
    # returns 0 for a final line that lacks a newline, which would end the
    # loop one row early.
    while ( defined( my $line = <$in> ) ) {
        chomp $line;

        # Columns: sprot_id, pro_leng, pri_ac, sec_ac, pro_name, synonyms,
        # gene_name, embl_nt, embl_pro, mim, kw.  Unused ones are skipped.
        my (
            $sprot_id, $pro_leng, $pri_ac, $sec_ac, $pro_name, undef,
            $gene_name, undef, undef, $mim, $kw
        ) = split( /\t/, $line );

        # Blank or truncated rows carry no accession to index by.
        next unless defined $pri_ac && length $pri_ac;

        $sec_ac = '' unless defined $sec_ac;
        $sec_ac =~ s/^\s+|\s+$//g;

        # Secondary accessions are ';'-separated.  Each one is trimmed so
        # that "A; B" yields the key "B" rather than " B", and empty
        # pieces (e.g. from a trailing ';') are ignored.
        my @sec_ac_array =
          grep { length }
          map { my $acc = $_; $acc =~ s/^\s+|\s+$//g; $acc }
          split( /;/, $sec_ac );

        # Every accession gets its own copy of the record, so that a
        # comment attached later to one accession does not leak to the
        # others.
        foreach my $acc ( $pri_ac, @sec_ac_array ) {
            $sprot_hash{$acc} = {
                pri_ac    => $pri_ac,
                sprot_id  => $sprot_id,
                pro_leng  => $pro_leng,
                sec_ac    => $sec_ac,
                pro_name  => $pro_name,
                gene_name => $gene_name,
                mim       => $mim,
                kw        => $kw,
            };
        }
    }
    close($in);

    ################################
    ## obtain RP/OR information   ##
    ################################
    my $rp_file =
      "/bh1/zl/mouse_pseudo/protein-queries/special.mouse.protein.tbl";
    open( my $rp, '<', $rp_file ) or die "cannot open $rp_file: $!\n";

    # No line is skipped here: every row is "accession, comment, comment2".
    while ( defined( my $line = <$rp> ) ) {
        chomp $line;
        my ( $rp_ac, $comment, $comment2 ) = split( /\t/, $line );
        next unless defined $rp_ac && length $rp_ac;

        # May create an entry for an accession absent from the summary
        # table; that entry then holds only the two comment fields.
        $sprot_hash{$rp_ac}{comment}  = $comment;
        $sprot_hash{$rp_ac}{comment2} = $comment2;
    }
    close($rp);
}

# query accession => { PSSD1 => row count, PSSD2 => row count }
my %pgene_hash;
{
    # Both pseudogene tables share one layout, so they are read by the
    # same loop; only the counter name and the path differ.
    my @pgene_sets = (
        [ PSSD1 => "./master.web.pssd1.gff" ],
        [ PSSD2 => "./master.web.pssd2.gff" ],
    );

    foreach my $set (@pgene_sets) {
        my ( $set_name, $infile ) = @{$set};
        open( my $in, '<', $infile ) or die "cannot open $infile: $!\n";

        # The first line is read and thrown away (presumably a header).
        my $skip = <$in>;

        while ( defined( my $line = <$in> ) ) {
            chomp $line;

            # Only the 8th tab-separated column (the query protein) is
            # needed; the remaining columns are not used by this script.
            my $query = ( split( /\t/, $line ) )[7];
            next unless defined $query && length $query;

            $pgene_hash{$query}{$set_name} += 1;
        }
        close($in);
    }
}

my $outfile = "mouse.count.PSSD1.tbl";
open( my $pssd1_out, '>', $outfile ) or die "cannot open $outfile: $!\n";

my $ac;

# Descending PSSD1 count.  Queries seen only in PSSD2 have no PSSD1 count
# and are treated as 0.  Ties are broken by accession so that the output
# order does not depend on hash ordering.
my @sorted_keys = sort {
    ( $pgene_hash{$b}{PSSD1} || 0 ) <=> ( $pgene_hash{$a}{PSSD1} || 0 )
      or $a cmp $b
} keys %pgene_hash;

foreach $ac (@sorted_keys) {
    my $count = $pgene_hash{$ac}{PSSD1} || 0;
    next if $count <= 0;

    # Look the annotation up without creating an empty %sprot_hash entry
    # for queries that are not in the summary table.
    my $info = $sprot_hash{$ac} || {};

    # Missing annotation fields are written as empty columns.
    my @annotation =
      map { defined $_ ? $_ : '' }
      @{$info}{qw(sprot_id pro_name gene_name comment)};

    print {$pssd1_out} join( "\t",
        $ac, $annotation[0], $count, @annotation[ 1 .. 3 ] )
      . "\n";
}

# Buffered write errors only surface when the handle is closed.
close($pssd1_out) or die "cannot close $outfile: $!\n";