use LWP::Simple; # # # INITIALIZE VARIABLES # # # #Initalize the GRAIL website address where results will be #obtained from $grailSite = 'http://www.broadinstitute.org/mpg/grail/results/'; #Initialize CutOffDates parameters that define the point #at which GRAIL website became compatible with this script $cutOffDate = 9; $cutOffMonth = 2; $cutOffYear = 2011; # # # GET USER INPUT # # # #get GRAIL ID from user $id = $ARGV[0]; if(length($id)==0) {$id = "1297250676";} if(exists($ARGV[1])) {$thinParam = $ARGV[1];} else {$thinParam = 0;} # # # GET ONLINE INPUT # # # print("Getting the results of GRAIL Run $id online...\n"); #Get results (out.html) url from the GRAIL site my $url1 = $grailSite . $id . '_out.html'; my $content = get $url1; #Get query regions from GRAIL my $url2 = $grailSite . $id . '_querySetRegions.html'; my $regions = get $url2; # # # ERROR CHECK ONLINE INPUT # # # #Make sure online files exist die "Couldn't get $url.\n\n" unless defined $content; die "Couldn't get $url.\n\n" unless defined $regions; #Make sure that basic (_out.html) structure is intact die "File error - please check $url1.\n\n" unless (($content =~ /\Analysis Summary\<\/b\>/) && ($content =~ /Genomic Regions/) && ($content =~ /REGION/)); #Make sure that basic (_querySetRegions.html) structure is intact die "File error - please check $url2.\n\n" unless (($regions =~ /REGION\s+CHR\s+START\s+STOP\s+GENES/)); #Make sure seed and query regions were set equal in the run die "Query and seed regions not equal.\n\n" unless($content =~ /Queries and Seed Regions are set equal/); #Make sure that the GRAIL query was since the cutOffDate #get DATE of query $content =~ /TIME\: (\d\d\d\d)-(\d\d)-(\d\d)/; $year = $1; $month = $2+0; $date = $3+0; #check date die "GRAIL query must be after $cutOffYear-$cutOffMonth-$cutOffDate to be compatabile with this script.\n\n" unless( ($year>$cutOffYear) || (($year==$cutOffYear)&&($month>$cutOffMonth)) ||(($year==$cutOffYear)&&($month==$cutOffMonth)&& ($date>=$cutOffDate))); #Make sure that the thinParameter is a floating point number die "the second argument, the thinning parameter must be a positive floating point number" unless($thinParam =~ /^[+]?[0-9]*\.?[0-9]+$/); # # # EXTRACT INFORMATION ABOUT REGIONS & GENES # # # #go through each line of the region url content, #one at a time, extract names of genes, and regions @eachRegion = split(/\n/, $regions); for($i=1; $i<@eachRegion; $i++) { @eachRegionLine = split(/\t/, $eachRegion[$i]); $regionName = $eachRegionLine[0]; #get the region name $regionNameArray[$i-1] = $regionName; #store region name in Array #Pull out gene names from each line $eachRegionLine[4] =~ s/<[^>]+>//g; #remove url tags @geneList = split(/\s+/, $eachRegionLine[4]); $flag = 0; #flag variable - if a gene is mapping to multiple regions # flag will become marked 1. for($j=0; $j<@geneList;$j++) { $geneName = $geneList[$j]; #Pull the actual gene Name #Error Check -check if the gene has already been mapped to some other region if( (exists($mapGeneToRegion{$geneName})) && ( $regionName ne $mapGeneToRegion{$geneName})) { print(stderr "The $geneName gene maps to multiple regions : $mapGeneToRegion{$geneName} and $regionName.\n\n"); $flag = 1; } #If the gene is already mapped don't double count elsif(exists($mapGeneToRegion{$geneName})) { } #If the gene is not mapped to this reigon, note the region else { #store - map of genes to regions - in a hash $mapGeneToRegion{$geneName} = $regionName; #store - list of genes for each region in an hash. $regionNameHash{$regionName}{$geneName} =1; #count - genes per region if(exists( $regionGeneCount{$regionName})){ $regionGeneCount{$regionName} = $regionGeneCount{$regionName}+1;} else{ $regionGeneCount{$regionName} = 1;} #if 1st gene, initilize } } if($flag==1) { die("Genes mapping to multiple regions, please correct and rerun GRAIL query\n\n"); } } # # # EXTRACT GENE & PAIRWISE GENE SIMILARITY INFORMATION # # # #Go through the output file, line by line, pull key information @contentArray = split(/\n/, $content); #split output content by line $t = ""; #Intialize t; line at which we get gene specific informaiton $GN = ""; # initialize Gene Count #Go through each output line by line for($i=0;$i<@contentArray;$i++) { #Pull out the number of genes in the analysis if($contentArray[$i] =~ /^A total of (\d+) genes/) { $GN = $1; } #go through each line until we find the "GENE" #header if($contentArray[$i] =~ /^GENE/) { $t = $i+1; } #if the line is past GENE, then get information if($i>$t && length($t)) { $contentArray[$i] =~ s/<[^>]+>//g; #remove url tags $contentArray[$i] =~ s/\,//g; #remove commas #get each line that has information about a gene @geneInfo = split(/\s+/, $contentArray[$i]); $gene = $geneInfo[0]; #Get Gene name #ERROR CHECK - Make sure that the gene maps to a region print stderr "Warning : The $gene gene does not map to any gene region.\n\n" unless (exists($mapGeneToRegion{$gene})); $pvalues{$gene} = $geneInfo[1]; #Pull Gene pvalue for($j=2; $j<@geneInfo; $j++) { #pull out similar genes, and ranks $geneInfo[$j] =~ /(.+)\((\d+)\)/; $gene2 = $1; $rank = $2; #ERROR CHECK - similar genes must map to regions also print stderr "WARNING: The $gene2 gene does not map to any gene region.\n\n" unless (exists($mapGeneToRegion{$gene})); #Store rank similarity $pairRanks{$gene}{$gene2} = $rank; #Convert The Rank into a connectivity SCORE #Mult hyp test correction the percentile of the rank, #for number of genes $score{$gene}{$gene2} = 1- ( (1-$rank/$GN)**$regionGeneCount{$mapGeneToRegion{$gene2}} ); #Convert p-val of similarity into a thickness score if(exists ($regionGeneCount{$mapGeneToRegion{$gene2}})) {$score{$gene}{$gene2} = ($score{$gene}{$gene2}<0.1) #Must be <0.1 *log($score{$gene}{$gene2}/0.1)*(-1/log(10)); #log scale it } } } } # # # PRINT OUT DATA FILE # # CONNECTS REGIONS TO GENES # # ... AND HAS GENE SCORES (PVALS) # # # print("Writing connection and data files for GRAIL Run $id...\n"); open(DATA, ">$id\_data"); #Iterate through each region foreach $region (keys(%regionNameHash)) { #pull genes from each region foreach $gene (keys(%{ $regionNameHash{$region} })) { if(!(exists($pvalues{$gene}))) { $pvalues{$gene} = 1; } #Print - REGION, GENE, PVAL print(DATA "$region\t$gene\t$pvalues{$gene}\n"); } } close(DATA); # # # PRINT OUT CONNECTION FILE # # CONNECTS GENES TO GENES # # ... AND STRENGHT OF CONNECTION (SCORE) # # # open("CON", ">$id\_connections"); @allGenes = (keys(%pvalues)); #array of all genes #Iterate through all gene pairs for($i=0; ($i+1)<@allGenes; $i++) { $gene = $allGenes[$i]; for($j=($i+1); $j<@allGenes; $j++) { $gene2 = $allGenes[$j]; #If pair not scored, set to zero #Pair is ordered, so need to look at both # ordered pairs if(!exists($score{$gene}{$gene2})) {$score{$gene}{$gene2} = 0;} if(!exists($score{$gene2}{$gene})) {$score{$gene2}{$gene} = 0 ;} #average score for each ordered pair $num = ($score{$gene2}{$gene} + $score{$gene}{$gene2})/2; #print only non-zero pairs if($num>3){$num=3}; #thin out $num $num = (1+$thinParam)*$num - $thinParam*3; if($num<0) {$num=0} #PRINT ONLY GENES THAT CAN BE MAPPED if( ($num>0) && (exists($mapGeneToRegion{$gene})) && (exists($mapGeneToRegion{$gene2})) ){ print(CON "$gene\t$gene2\t$num\n"); } } }