use GD::Simple; use Math::Trig; use List::Util qw[min max]; use List::Util 'shuffle'; use POSIX qw(ceil floor); # # # INITIALIZE VARIABLES # # # $maxRecGenes = 200; #Maximum Recommended Number of Genes $maxRecRegions = 100; #Maximum Recommended Number of Regions $maxThick = 3; #Maximum thickness for a line $maxIter = 5000; #Maximum number of iterations $outputFile = "graphCluster.png"; # # # ASSIGN PARAMETERS # # # #Default parameter assignement %paramHash = % {initializeParameters(0.05, 200,1,1,"graph_data","graph_connect")} ; #update default parameters with user inputs %paramHash = % {updateParams(\@ARGV, \%paramHash )}; errorCheck(\%paramHash); #assign parameter variables $regionFile = $paramHash{"--data"}; $connectionFile = $paramHash{"--connect"} ; $clusterParameter = $paramHash{"--clusterRegions"}; $orientParameter = $paramHash{"--orient"} ; $geneClusterParam = $paramHash{"--clusterGenes"} ; $scoreThresh = $paramHash{"--scoreThresh"} ; # # # READ IN REGIONS & GENES # # FROM DATA FILE # # # #Open Data File - to read in regions first if(!open(IN, "$regionFile")){die("Could not open $regionFile.\n\n")} $cnt = 0; #cnt used to count number of regions while($line=) { chomp($line); @temp = split(/\s+/, $line); $regName = $temp[0]; if(exists($regHashIndex{$regName})) #if already noted, then skip { } else #if we encounter the region for the first time, store info { $regHashIndex{$regName} = $cnt; #store region name in a hash ... $regArray[$cnt] = $regName; # ... and also an array $cnt = $cnt +1; } } $numRegions = $cnt; #Note total number of regions close(IN); #open Data File, again to read in genes, and scores if(!open(IN, "$regionFile")){die("Could not open $regionFile.\n\n")} $numGenes = 0; #numGenes used to keep track of total number of genes while($line= ) { $numGenes = $numGenes+1; chomp($line); @temp = split(/\s+/, $line); $gene = $temp[1]; #note gene name $regName = $temp[0]; #region name $geneScore{$gene} = $temp[2]; #note score in hash $geneRegion{$gene} = $temp[0]; #note region in hash if(exists($regHash{$regName})) #get total count of regions { $last{$regName} = $last{$regName} + 1; $regHash{$regName}[$last{$regName}] = $gene; } else #initialize structures with first occurence of the region { $regHash{$regName}[0] = $gene;#Keeps track of the gene in each region, in order $last{$regName} = 0; #last - notes the index of the last gene # in a region } } close(IN); print(stderr "Read in regions and genes...\n"); #Warning if there are too many genes if($numGenes>$maxRecGenes) { print(stderr "WARNING: Maximum recomended gene number ( $maxRecGenes ), exceeded in $regionFile\n\n"); } #Warning if there are too many regions if($numRegions>$maxRecRegions) { print(stderr "WARNING: Maximum recomended region number ( $maxRecRegions ), exceeded in $regionFile\n\n"); } # # # READ IN PAIRWISE GENE CONNECTIONS # # FROM CONNECTIONS FILE TO # # TABULATE "TRAFFIC" BETWEEN REGIONS # # # if(!open(IN2, "$connectionFile")) {die("Could not open $connectionFile.\n\n")} while($line=) { chomp($line); @temp = split(/\s+/,$line); if((exists($geneScore{$temp[0]}) & exists ($geneScore{$temp[1]}))) #If both genes are scored { #check and see if scores are both below # the threshold, if so, look for connections if(($geneScore{$temp[0]}<$scoreThresh) & ($geneScore{$temp[1]}<$scoreThresh)){ #tally up the total numbers of connections between the #regions the regions that the genes sit in... #dis contains the cumulative score of connections #between two regions # NOTE - need to initialize both ordered pairs if(exists($dis{$geneRegion{$temp[0]}}{ $geneRegion{$temp[1]} })){ $dis{$geneRegion{$temp[0]}}{ $geneRegion{$temp[1]}} = $temp[2]+ $dis{$geneRegion{$temp[0]}}{ $geneRegion{$temp[1]}}; $dis{$geneRegion{$temp[1]}}{ $geneRegion{$temp[0]}} = $temp[2]+ $dis{$geneRegion{$temp[1]}}{ $geneRegion{$temp[0]}}; } else #initialize dis if it is the first connection between 2 regions { $dis{$geneRegion{$temp[0]}}{ $geneRegion{$temp[1]}} = $temp[2]; $dis{$geneRegion{$temp[1]}}{ $geneRegion{$temp[0]}} = $temp[2]; } $dis2{$temp[0]}{$temp[1]} = $temp[2]; $dis2{$temp[1]}{$temp[0]} = $temp[2]; } } } close(IN2); # # # now cluster - # # if user has seleced the option # # # if(($clusterParameter=~/\d+/) && ($clusterParameter>0 ) ) { $numIterations = $clusterParameter; #then cluster print(stderr "Clustering regions ... \n\n"); #Initialize Best Ever ordering of regions, #Current ordering of regions (@pos), and score #as the initial entered condition @pos = @regArray; $x1 = pathtotalglobal(\@pos , \%dis); #calculate the total path @bestEver = @pos; $bestEverMin = $x1; #keep track of the number of iterations where their has been no change $noChange = 0; for($k=1; ($k<$numIterations)&& ($bestEverMin>0); $k++) #max number of iterations { $crossMin = $x1; if($noChange==0) { #Only examine positions, where there #are connections crossing other connections #Only do this, if there has been a change in the order @conflicts = checkpathconflicts(\@pos, \%dis); } #define the indices that we might pick... #remove all of the indices that are not connected to other regions #by a path that crosses others @indices = @{ nonZeroIndices(\@conflicts) }; #Pick a random index to work with, of those without conflicts @shuffled = shuffle(@indices); $indexa = $shuffled[0]; #How many potential choices are there? $at = $#indices+1; #Since that position will be scrutinize #we consider it "unconflicted" #to avoid looking at it in the next iteration $conflicts[$indexa] = 0; #Print Iteration #, selected locus print(stderr "\nIteration \#$k. Evaluating optimal fit of locus : "); print(stderr "(selected from $at) "); print(stderr "$pos[$indexa]"); for($p=0; $p<3; $p++) { for($shuf2 =0; $shuf2<$numRegions; $shuf2++) { if($p==0) { $changeType = "repositioning $pos[$indexa] from $indexa to $shuf2";} if($p==1) { $changeType = "switching $pos[$indexa] with $pos[$shuf2]";} if($p==2) { $ll = ($indexa-$shuf2) % $numRegions; $changeType = "inverting a $ll region long segment from $pos[$indexa] to $pos[$shuf2]";} if(($indexa == $shuf2) || (!(exists($dis{$pos[$indexa] })))) {} #If we are shifting to the position that we are in #or there are no connections out of $indexa #no need to evaluate else { @pos2 = @ {changeRegionOrder(\@pos,$indexa, $shuf2, $p)}; $x2 = pathtotalglobal(\@pos2 , \%dis); #calculate the total path #check to see if this particular region arrangement is the best #for this iteration if($x2<$crossMin) { printf(stderr "\n Can improve score from %0.3f to %0.3f by\n" ,$x1 ,$x2); print (stderr " $changeType"); $crossMin = $x2; @bestMin = @pos2; } } } } #check if this particular region arrangement is the best #seen yet if($crossMin < $bestEverMin) { print(stderr "\n\n\t**Obtained best optimization so far in iteration \#$k"); @bestEver = @bestMin; $bestEverMin = $crossMin; } #check if this particular region arrangement is an improvement #on the arrangement resulting from the previous iteration if($crossMin < $x1) { $noChange = 0; #since we have a change, we zero out no change parameter printf(stderr "\n\tImproved score from %.3f to %.3f\n", $x1, $crossMin); @pos = @bestMin; $x1 = $crossMin; } #if we have tried every single paraeter, we need to #shuffle to get out of a local minimum elsif( ($#indices+1) ==0) { $noChange = 0; @pos = @{softShuffle(\@pos)}; $x1 = pathtotalglobal(\@pos , \%dis); #calculate the total path printf(stderr "\n\t>>No improvement in Iteration \#$k...<<\n\tRandomly shuffling loci positions. \n\t(Note current best observed score is %0.3f)\n", $bestEverMin); } #otherwise, we have not altered any position, #and we increment the nochange parameter else { $noChange +=1; print(stderr "\nTotal number of no change steps: $noChange\n"); } } $x1 = pathtotalglobal(\@bestEver , \%dis); $x2 = pathtotalglobal(\@regArray, \%dis); printf(stderr "\n\nClustering complete!! Best score achieved is %0.3f\n", $bestEverMin); if($x1<$x2) { @regArray = @bestEver; } print(stderr "\nWriting order of clustered SNPs to file : clusteredOrder\n"); open(OUT, ">clusteredOrder"); for($ll=0; $ll<@regArray;$ll++) { print(OUT "$regArray[$ll]\n"); } } elsif ($clusterParameter eq "0") { print(stderr "No clustering of regions ... \n\n"); } else { $cnt = 0; if(!open(IN, "$clusterParameter")){die("Could not open the file $clusterParameter.\n\n");} print(stderr "Reading in $clusterParameter ... \n\n"); while($line =) { chomp($line); $regArray[$cnt] = $line; $cnt = $cnt + 1; if(exists($regHashIndex{$line})) { } else { die("could not find $line in $clusterParameter"); } } } #Cluster genes within regions if($geneClusterParam == 1 ) { print(stderr "\n\nOrdering genes....\n\n"); %bestRegHash = %regHash; $bestHashScore = 1000000000; @bigGeneArray = []; for($r=0; $r<$numRegions; $r++) { $thisRegion = $regArray[$r]; @notShuffled = @{$regHash{$thisRegion}}; #identify the number of active genes in this region ($activeGeneCnt , $indexPointer) = findActiveGenes(\@notShuffled, \%dis2 ); @activeGeneIndices = @{$indexPointer}; $totalGenes = $last{$thisRegion}+1; $iteratedPointer = defineIterates($activeGeneCnt, \@notShuffled, \@activeGeneIndices); @iterated = @{$iteratedPointer}; for($gi=0; $gi<@iterated; $gi++) { #Update the not shuffled parameter to represent #the latest arrangement of genes in this region @notShuffled = @{$regHash{$thisRegion} }; #rearrange the genes in the region @{$regHash{$thisRegion} } = @{$iterated[$gi]}; #score the genes, with genes in "this region" rearranged $geneArrangeScore[$gi] = scoreGeneArrangement(\@regArray, \%regHash, \%dis2 ); if($geneArrangeScore[$gi]<$bestHashScore ) { $bestHashScore = $geneArrangeScore[$gi]; @{$regHash{$thisRegion}} = @{$iterated[$gi]}; } else { @{$regHash{$thisRegion}} = @notShuffled; } } print(stderr "Finished ordering genes in $thisRegion". "\n $activeGeneCnt genes out of $totalGenes are connected" . "\n $gi iterations achieved a score of $bestHashScore\n"); } } else { print(stderr "\n\nNo ordering of genes....\n\n"); } #orient circle if($orientParameter ==1) { $mostConRegion = findMostConnectedRegion( \@regArray, \%dis) ; print(stderr "$mostConRegion will be oriented to the top.\n\n" ); @regArray = @ { orientCircle( \@regArray, $mostConRegion ) } ; } elsif($orientParameter eq "0") { } else { print(stderr "Orienting plot to place $orientParameter region at the top\n"); @regArray = @ { orientCircle( \@regArray, $orientParameter ) } ; } print(stderr "Creating an image file\n\n"); # # # Make a new image here # # # # Initialize a new image $img = GD::Simple->new(4000,4000); # Define the center of mass $cX = 2000; $cY = 2000; #Define the diameter and radius of the central circus $diam = 2300; $radius = $diam/2; # Define fraction of diameter that represents inner bound $innerBound = 0.95; #Define colors here $black = $img->colorAllocate(255,255,255); $blue = $img->colorAllocate(20,20,255); $grey = $img->colorAllocate(20,20,20); $greyII = $img->colorAllocate(130,130,170); $red = $img->colorAllocate(100,40,40); print(stderr "Drawing regions and genes...\n\n"); #Draw and label regions $cnt = 0; for($i=0; $i<@regArray; $i++){ print(stderr "\nDrawing $regArray[$i]...\n"); if(($i%2)==0){ $img->fgcolor($blue); $col = $blue; } else{ $img->fgcolor($grey); $col = $grey; } $region = $regArray[$i]; #label the region $deg = ($cnt+($last{$region}+0.5)/2)*(360/$numGenes); $rad = deg2rad($deg); $cc = sin($rad); $dd = cos($rad); $img->moveTo($cX+1.32*$radius*$cc,$cY-1.32*$radius*$dd); $img->font('Arial'); $img->fontsize(60); $img->angle($deg+270); $img->fgcolor($grey); $img->string("$region"); $img->fgcolor($col); $img->fgcolor('white'); $img->penSize(20,20); $deg = ($cnt-0.5)*(360/$numGenes); $rad = deg2rad($deg); $cc = sin($rad); $dd = cos($rad); $img->moveTo($cX+1.28*$radius*$cc,$cY-1.28*$radius*$dd); $img->angle($deg+270); $img->line($radius*0.05); $deg = ($cnt+($last{$region}+0.5))*(360/$numGenes); $rad = deg2rad($deg); $cc = sin($rad); $dd = cos($rad); $img->moveTo($cX+1.2*$radius*$cc,$cY-1.2*$radius*$dd); $img->angle($deg+270); $img->line($radius*0.2); $img->penSize(1,1); $img->fgcolor($col); print(stderr "Labeling:"); for($k=0; $k<=$last{$region}; $k++){ $gene = $regHash{$region}[$k]; print(stderr "$gene "); $deg = $cnt*(360/$numGenes); $rad = deg2rad($deg); $cc = sin($rad); $dd = cos($rad); $geneDeg{$gene} = $deg; $genePosX{$gene} = $cX+0.99*$innerBound*$radius*$cc; $genePosY{$gene} = $cY-0.99*$innerBound*$radius*$dd; $img->moveTo($cX,$cY); for($j=0; $j<50; $j++){ #$img->arc($diam*1.25-$j,$diam*1.25-$j,$deg-90-0.5*(360/$numGenes),$deg-90+0.5**(360/$numGenes),gdNoFill|gdEdged); } for($j=0; $j<1000; $j++){ #$img->arc($diam-$j,$diam-$j,$deg-90-0.5*(360/$numGenes),$deg-90+0.5*(360/$numGenes),gdNoFill|gdEdged); #$img->arc($diam-$j,$diam-$j,-90,-90+(360/$numGenes),gdNoFill|gdEdged); $deg = $cnt*(360/($numGenes)) + (($j-500)/1000)*(360/($numGenes)); $rad = deg2rad($deg); $cc = sin($rad); $dd = cos($rad); $img->moveTo($cX+$innerBound*$radius*$cc,$cY-$innerBound*$radius*$dd); $img->angle($deg+270); $img->line( (1-$innerBound)*$radius); $img->moveTo($cX+1.29*$radius*$cc,$cY-1.29*$radius*$dd); $img->angle($deg+270); $img->line(0.02*$radius); } $img->penSize(6,6); $img->fgcolor('white'); $img->moveTo($cX+$innerBound*$radius*$cc,$cY-$innerBound*$radius*$dd); $img->angle($deg+270); $img->line( (1-$innerBound)*$radius); $img->fgcolor($col); $img->penSize(1,1); if($geneScore{$gene}<=2) { $img->moveTo($cX+1.01*$radius*$cc,$cY-1.01*$radius*$dd); $img->font('Arial'); $img->fontsize(40); $img->angle($deg+270); if($geneScore{$gene}<=$scoreThresh){ $img->font('Arial:bold'); $img->fgcolor($grey); } else{ $img->fgcolor($greyII); } $img->string("-$gene"); $img->fgcolor($col); } $cnt = $cnt + 1; } } #Now Draw the connections if(!open(IN, "$connectionFile")){die("Could not open $connectionFile.\n\n")} while($line = ) { chomp($line); @temp = split(/\s+/, $line); $fir = $temp[0]; $sec = $temp[1]; if($temp[2]>3){$temp[2] = 3} $s = 0; $temp[2] = (1+$s)*$temp[2] - $s*3; if($temp[2]<0.01){$temp[2] = 0.01} if(exists($geneScore{$fir}) &exists ($geneScore{$sec}) & ($geneScore{$fir}<=$scoreThresh) & ($geneScore{$sec}<=$scoreThresh) & ($geneRegion{$fir} ne $geneRegion{$sec})) { $red = $img->colorAllocate(min(30+$temp[2]*200,255),50,50); $img->fgcolor($red); connec($genePosX{$fir}, $genePosY{$fir}, $genePosX{$sec}, $genePosY{$sec}, $temp[2]*15); # print(stderr "$fir-->$sec\n"); } } # convert into png data and write out to file print(stderr "\n\nPrinting to $outputFile\n\n"); open(OUT, ">$outputFile"); binmode OUT; print OUT $img->png; ################################################################ # # Sub functions # ################################################################ sub connec { my $fx = shift; my $fy = shift; my $sx = shift; my $sy = shift; my $thick = shift; my $res = 500; $img->penSize($thick,$thick); $img->moveTo($fx, $fy); my $midX = ($fx + $sx)/2; my $midY = ($fy + $sy)/2; $dist = sqrt(($fx-$sx)*($fx-$sx) + ($fy-$sy)*($fy-$sy)); for($i=0; $i<=$res; $i++) { my $dx = $fx + ($sx-$fx)*$i/$res; my $dy = $fy + ($sy-$fy)*$i/$res; my $ccx = $dx + (($dist/$diam)**0.25)*2*(1 - $i/$res)*($i/$res - 0)*($cX - $midX); my $ccy = $dy + (($dist/$diam)**0.25)*2*(1 - $i/$res)*($i/$res - 0)*($cY - $midY); $img->lineTo($ccx,$ccy); #$img->string("$i"); #print(stderr "$ccx $dx $ccy $dy $i\n"); } $img->lineTo($sx,$sy); } sub paths{ my $firIndex = shift; my $secIndex = shift; my $numRegions = shift; my $i = $firIndex+1; $i = ($i % $numRegions); my @arrayIndices; $cnt = 0; while($i != $secIndex) { $arrayIndices[$cnt] = $i; $cnt = $cnt + 1; $i = $i + 1; $i = $i % $numRegions; } return @arrayIndices; } sub pathtotal{ my $inda = shift; my $indb = shift; my $numSet = shift; my $posPoint = shift; my $disHash = shift; my @position = @{$posPoint}; my %distance = %{$disHash}; my @aa = paths($inda,$indb,$numSet); my @bb = paths($indb,$inda,$numSet); my $ii; my $jj; my $crosses = 0; if(exists($distance{$position[$inda]}{$position[$indb]})) { for($ii=0; $ii<@aa; $ii++) {for($jj=0; $jj<@bb; $jj++){ $crosses = $crosses + $distance{$position[$aa[$ii]]}{$position[$bb[$jj]]}; } } $crosses = $crosses * $distance{$position[$inda]}{$position[$indb]}; } else { $crosses = 0; } return $crosses; } sub pathtotalglobal{ my $posPoint = shift; my $disHash = shift; my @position = @{$posPoint}; my %distance = %{$disHash}; my $numSet = $#position + 1; my $inda; my $indb; my $ii; my $jj; $totalBig = 0; for($inda=0; $inda<($numSet -1); $inda++) { for($indb=$inda+1; $indb<$numSet; $indb++) { $total = 0; my @aa = paths($inda,$indb,$numSet); my @bb = paths($indb,$inda,$numSet); if(exists($distance{$position[$inda]}{$position[$indb]})) { my $xx = $distance{$position[$inda]}{$position[$indb]}; for($ii=0; $ii<@aa; $ii++) {for($jj=0; $jj<@bb; $jj++){ $total = $total + $xx*$distance{$position[$aa[$ii]]}{$position[$bb[$jj]]}; } } $totalBig = $total + $totalBig; } } } return($totalBig); } sub checkpathconflicts{ my $posPoint = shift; my $disHash = shift; my @position = @{$posPoint}; my %distance = %{$disHash}; my @total; my $numSet = $#position + 1; my $inda; my $indb; my $ii; my $jj; $totalBig = 0; for($inda=0; $inda<($numSet); $inda++) { $total[$inda] = 0; for($indb=0; $indb<($numSet); $indb++) { if($inda==$indb){} else { $total = 0; my @aa = paths($inda,$indb,$numSet); my @bb = paths($indb,$inda,$numSet); if(exists($distance{$position[$inda]}{$position[$indb]})) { my $xx = $distance{$position[$inda]}{$position[$indb]}; for($ii=0; $ii<@aa; $ii++) {for($jj=0; $jj<@bb; $jj++){ $total[$inda] = $total[$inda] + $xx*$distance{$position[$aa[$ii]]}{$position[$bb[$jj]]}; } } } } } } return @total; } sub genPermuTranspOrder { # @e is the original (unpermuted) list of elements. my @e = @_; # @o is the order we want to output the elements. # So @e[@o] is the permuted list. my @o = (0..$#e); # @p is the position of each element in @o, # that is, @o = (0..$#e)[@p] my @p = (0..$#e); # Note that it is also true that @p = (0..$#e)[@o]. # $d[$n] is the direction we are moving $e[$n]. my @d = (-1) x @e; # We return a code reference (closure) that each time # it is called will return the next permutation of @e. # Note that the first time this is called, it permutes # @e one step, so the calling code must output the # original @e before calling the iterator. return sub { # Start by assuming we'll move the last element of @e. my $n = $#e; my $j; for(;;) { # If we move this one, we'd set $p[$n] += $d[$n]. # So $j is the possible new value for $p[$n]. $j= $p[$n] + $d[$n]; # That is okay if $j is in range and # we aren't swapping with an element later in @e. last if 0 <= $j && $j <= $#e && $o[$j] < $n; # It wasn't okay. So move this element # in the other direction next time. $d[$n]= -$d[$n]; # And move to the left in @e. --$n; # If we are to the first element in @e # then we are done. return if $n < 1; } # $p[$n] is the (permuted) position of the $n'th # element ($e[$n]) we wish to swap in the direction # of $d[$n]. So we want to swap $o[ $p[$n] ] with # $o[ $p[$n]+$d[$n] ]. $j is already that second # offset so we'll set $i to be the first offset # and later we'll swap @o[$i,$j]. my $i = $p[$n]; # We need to keep @o = (0..$#e)[@p], so we need # swap corresponding elements of @p to match @o. # We should swap @p[@o[$i,$j]], but $o[$i] is $n. @p[ $n, $o[$j] ]= @p[ $o[$j], $n ]; @o[$j,$i] = @o[$i,$j]; # To confuse your instructor, the commented # line produces the permutations in a different # and strange "transposition order". # return @e[@p]; return @e[@o]; }; } sub findActiveGenes() { my $nsPointer = shift; my $dis2Pointer = shift; my @notShuffled = @{$nsPointer}; my %dis2 = %{$dis2Pointer}; my $activeGenes =0; my @activeGeneIndices = (); my $ag; for($ag=0; $ag<@notShuffled; $ag++) { if(exists($dis2{$notShuffled[$ag]})){ $activeGenes++; $activeGeneIndices[$#activeGeneIndices+1] = $ag; } } return($activeGenes, \@activeGeneIndices) } sub defineIterates() { my $activeGeneCnt = shift; my $nsPointer = shift; my @notShuffled = @{$nsPointer}; my $agiPointer = shift; my @activeGeneIndices = @{$agiPointer}; my @iterated = (); if($activeGeneCnt>4) { my $numGeneIters=24 ; for(my $gi=0; $gi<$numGeneIters; $gi++) { @shuffledGenes = shuffle(@notShuffled); @{$iterated[$gi]} = @shuffledGenes; } } elsif($activeGeneCnt<=1) { @{$iterated[0]} = @notShuffled; } else { #note the indices - initially unshuffled my @shuffledIndices = @activeGeneIndices; #define iterator my $iter = genPermuTranspOrder( @shuffledIndices ); my $iterInd = 0; #Permute through iterations do { #Note the genes prior to shuffling @shuffledGenes = @notShuffled; #for each gene, used shuffled active indices to reorder for(my $ix=0; $ix<@activeGeneIndices;$ix++){ $shuffledGenes[$activeGeneIndices[$ix]] = $notShuffled[$shuffledIndices[$ix]]; } @{$iterated[$iterInd]} = @shuffledGenes; $iterInd +=1; } while( @shuffledIndices = $iter->() ); } return(\@iterated); } sub scoreGeneArrangement() { my $raPointer = shift; my @regArray = @{$raPointer}; my $rhPointer = shift; my %regHash = %{$rhPointer}; my $gdPointer = shift; my %geneDis = %{$gdPointer}; my @bigGeneArray = []; for(my $rr=0; $rr<@regArray; $rr++) { $thisRegion2 = $regArray[$rr]; for(my $g=0; $g< @{$regHash{$thisRegion2}}; $g++) { $bigGeneArray[$#bigGeneArray+1] = $regHash{$thisRegion2}[$g]; } } return( pathtotalglobal(\@bigGeneArray , \%geneDis) ); } sub changeRegionOrder { my $posPointer = shift; my @pos = @{$posPointer}; my $indexa = shift; my $shuf2 = shift; my $p = shift; my @pos2 = @pos; #Move $indexa to another position if($p==0){ splice(@pos2, $indexa , 1); @temp =("$pos[$indexa]"); splice(@pos2, $shuf2, 0, @temp); } #Swap $indexa with another position if($p==1){ $pos2[$indexa] = $pos[$shuf2]; $pos2[$shuf2] = $pos[$indexa]; } #Invert the stretch from $indexa to another position if($p==2){ $steps = ($shuf2-$indexa+1); if($steps<0){$steps= $steps+$numRegions} for($ap=indexa; $ap<$steps; $ap++) { $ind1 = ($indexa+$ap)% $numRegions; $ind2 = ($shuf2-$ap) % $numRegions; $pos2[$ind1] = $pos[$ind2]; } } return(\@pos2); } sub nonZeroIndices { my $conflictPointer = shift ; my @conf = @{$conflictPointer}; my @nonZeroIndex = (); for(my $i=0; $i<@conf; $i++) { if($conf[$i] ==0){} else{ $nonZeroIndex[$#nonZeroIndex+1] = $i } } return(\@nonZeroIndex); } sub softShuffle { #shuffles about half of the positions my $posPointer = shift ; my @pos = @{$posPointer}; my $numRegions = $#pos + 1; my @indices = (1 .. ($numRegions-1)); my @shuffled = shuffle(@indices); #shuffle only half of the regions my @shuffled1 = @shuffled[0..floor($numRegions/2)]; my @shuffled2 = shuffle(@shuffled[0..floor($numRegions/2)]); my @pos2 = @pos; @pos2[@shuffled1] = @pos[@shuffled2]; return(\@pos2); } sub findMostConnectedRegion { my $regPointer = shift; my $disPointer = shift; my @regArray = @{$regPointer}; my %dis = %{$disPointer}; my $maxScore = 0; my $maxIndex = 0; my @scoreComplexity; for(my $ll=0; $ll<@regArray;$ll++) { $scoreComplexity[$ll] = 0; foreach my $key (keys(%{$dis{$regArray[$ll]}})) { $scoreComplexity[$ll] = $scoreComplexity[$ll] + $dis{$regArray[$ll]}{$key}; } if($scoreComplexity[$ll]>$maxScore) { $maxScore = $scoreComplexity[$ll]; $maxIndex = $ll; } } return( $regArray[$maxIndex] ) } sub orientCircle { my $regPointer = shift; my $orientLabel = shift; my @regArray = @{$regPointer}; my $orientInd = -1; my $newInd; my @regArray2; my $i; my $numReg = $#regArray + 1; for($i=0; $i<@regArray; $i++) { if($regArray[$i] eq $orientLabel){$orientInd = $i;} } for($i=0; $i<@regArray; $i++) { $newInd = ($i + $orientInd) % $numReg; #print(stderr "$i $newInd\n"); $regArray2[$i] = $regArray[$newInd]; } @regArray = @regArray2; ### return (\@regArray2); } sub initializeParameters { my %hash ; $hash{"--scoreThresh"} = shift; #Default score threshold $hash{"--clusterRegions"}= shift; #Default, cluster regions with 200 iterations $hash{"--clusterGenes"} = shift; #Default, cluster genes within regions $hash{"--orient"} = shift; #Default, orient circle $hash{"--data"} = shift; $hash{"--connect"} = shift; return (\%hash); } sub updateParams { my $inputPointer = shift; my $hashPointer = shift; my @args = @{$inputPointer}; my %paramHash = %{$hashPointer}; my %newParams = %paramHash; for(my $i=0; $i<@args; $i+=2) { die("\n\n\"$args[$i]\" is not an appropriate parameter flag\nAppropriate parameter flags are \"--data\", \"--connect\", \"--clusterRegions\", \"--scoreThresh\", \"--clusterGenes\", and \"--orient\"\n\n") unless(exists($paramHash{$args[$i]})); die("\n\nNo argument for \"$args[$i]\" entered\n\n") unless(exists($args[$i+1])); $newParams{$args[$i]} = $args[$i+1]; } return(\%newParams); } sub errorCheck { my $paramPointer = shift; my %params = %{$paramPointer}; my $line; my @reg; my %geneCheck; my %regCheck; my $flag; my %clusterFileRegions; my $regionFile = $params{"--data"}; my $connectionFile = $params{"--connect"} ; my $clusterParameter = $params{"--clusterRegions"}; my $orientParameter = $params{"--orient"} ; my $geneClusterParam = $params{"--clusterGenes"} ; my $scoreThresh = $params{"--scoreThresh"} ; #check data (regions) file #check that it opens if(!open(IN, $regionFile)){die("Could not open $regionFile file.\n\n")} #check the format, line by line $flag = ""; while($line=) { chomp($line); @reg = split(/\s+/, $line); #check format for each line $flag = $flag . "Formatting problem in $regionFile:\nline==>$line.\n" unless( ($reg[0]=~/.+/) && ($reg[1]=~/.+/) && ($reg[2]=~/^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$/)); #check and see if any gene is lised twice $flag = $flag . "The $reg[1] gene is in $regionFile multiple times.\n" unless(!exists($geneCheck{$reg[1]})); $geneCheck{$reg[1]} = 1; $regCheck{$reg[0]} = "1"; #check to make sure score/pvals are between 0 and 1 $flag = $flag . "P-value out of range in $regionFile:\n==>$line.\n" unless ( ($reg[2]<=1) && ($reg[2]>=0)) } if(length($flag)>0){die("Error in $regionFile (--data) file:\n" . $flag)} close(IN); #ERROR - connection File if(!open(IN, "$connectionFile")){die("Could not open $connectionFile.\n\n")} #check connection file format $flag = ""; while($line=) { chomp($line); @reg = split(/\s+/, $line); #check formatting $flag = $flag . "Formatting problem in $connectionFile:\n$==>$line.\n" unless( ($reg[0]=~/.+/) && ($reg[1]=~/.+/) && ($reg[2]=~/^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$/)); #make sure pairs are not being duplicated $flag = $flag . "==>$line\n\nThe $reg[1] and $reg[0] gene pair is in $connectionFile multiple times.\n" unless(!exists($pairCheck{$reg[1]}{$reg[0]})); $pairCheck{$reg[1]}{$reg[0]} = 1; $pairCheck{$reg[0]}{$reg[1]} = 1; #check and make sure both genes are in the date file $flag = $flag . "$reg[0] gene in $connectionFile is not listed as a gene in $regionFile.\n" unless($geneCheck{$reg[0]}); $flag = $flag . "$reg[1] gene in $connectionFile is not listed as a gene in $regionFile.\n" unless($geneCheck{$reg[1]}); print(stderr "WARNING : In $connectionFile, in the line\n==>$line\nThere is a connectivity score > $maxThick ... it will be corrected to $maxThick.\n\n") unless($reg[2]<=$maxThick); } close(IN); if(length($flag)>0){die("Error in $connectionFile (--connect) file:\n" . $flag)} # Cluster Parameter #check to see if it is a file name if(open(IN, $clusterParameter)) { $flag = ""; #check that each region is in the Region Data file while($line=) { chomp($line); $clusterFileRegions{$line} = 1; $flag = $flag . "The $line is not a region listed in $regionFile\n" unless(exists($regCheck{$line})); } #check that each region in the Region Data file is in this file foreach $region (keys(%regCheck)) { $flag = $flag . "The $region region listed in the $regionFile region file is not listed in the $clusterParameter clustering file\n" unless(exists($clusterFileRegions{$region})); } if(length($flag)>0){die("Error in $clusterParameter (--clusterRegions) file:\n" . $flag)} } else { die("Cluster Parameter (3rd argument) must be either 1 (no clustering), a value between 2 and $maxIter (number of clustering iterations), or an appropriate file name") unless ( ( ($clusterParameter =~ /^\d+$/)&&($clusterParameter<=$maxIter)) ||open(IN, $clusterParameter)); } #check scoreThresh die("The $scoreThresh parameter must be a value between 0 and 1\n") unless( ($scoreThresh=~/^[+]?[0-9]*\.?[0-9]+$/) && ($scoreThresh<=1) && ($scoreThresh>=0)); #check orient parameter die("The $orientParameter must be 0,1, or a region named in the region (--data) file") unless( ($regCheck{$orientParameter}) || ($orientParameter eq "0") || ($orientParameter eq "1")); #check gene cluster parameter print(stderr "$geneClusterParam\n\n"); die("The $geneClusterParam (--clusterGenes parameter) must be 0 or 1") unless( ($geneClusterParam eq "0") || ($geneClusterParam eq "1")); }