# Walk-through script: reads the text file named in $file, separates lines
# starting with ">" (names) from all other non-blank lines (sequences), and
# for every sequence builds a marked-up "top" and "bottom" strand, printing a
# narrated trace of each step.  The finished name/top/bottom triples are
# printed at the end.
use strict;
use warnings;

# Input file, opened relative to the current working directory.
my $file = "filename";

# Offset that splits a sequence into top strand (before) and bottom strand
# (from this offset onwards).
my $LINKER_END = 205;

# Barcode window inside the top strand.  The top strand is at most
# $LINKER_END (205) characters long, so a window of 8 starting at 198 runs
# past its end and substr() yields at most 7 characters.
my $BARCODE_START  = 198;
my $BARCODE_LENGTH = 8;

# 0-based offsets of the CpG sites that are inspected in the bottom strand
# and in the (already reversed) top strand.
my @BOTTOM_CPG_SITES = (
    33,  45,  47,  57,  68,  84,  100, 102, 106, 110, 115,
    117, 119, 130, 135, 143, 147, 160, 162, 168, 170, 172,
);
my @TOP_CPG_SITES = (
    39,  51,  53,  63,  74,  90,  106, 108, 112, 116, 121,
    123, 125, 136, 141, 149, 153, 166, 168, 174, 176, 178,
);

# mark_cpg_sites(\$strand, $replacement, @positions)
#
# For each offset in @positions, prints the character found there and, when
# it is an uppercase 'T', overwrites it in place with $replacement.
# Offsets beyond the end of the strand print an empty value and are left
# alone (no "substr outside of string" warning or exception).
# Returns nothing; the strand is modified through the reference.
sub mark_cpg_sites {
    my ($strand_ref, $replacement, @positions) = @_;

    for my $position (@positions) {
        my $base = $position < length ${$strand_ref}
                 ? substr(${$strand_ref}, $position, 1)
                 : '';
        print "Here is the CpG site in question: $base\n\n";
        if ($base eq 'T') {
            substr(${$strand_ref}, $position, 1, $replacement);
        }
    }
    return;
}

# --- Read the input ---------------------------------------------------------

open my $input, '<', $file or die "Could not open file\n$!";
my @text = <$input>;
close $input;

print "Here is the text file that the script will perform operations on:\n\n";
print @text;

# --- Split into names and sequences -----------------------------------------

my @names     = ();
my @sequences = ();

print "\nThis script will take the text file, and separate the sequences "
    . "fromthe sequence names, placing them in sequence and name arrays, "
    . "respectively";

# NOTE: every line that is not a ">" header is stored as its own sequence, so
# a record wrapped over several lines will not line up with @names.
for my $line (@text) {
    if ($line =~ /^>/) {
        push @names, $line;
    }
    else {
        push @sequences, $line;
    }
}

# Drop blank / whitespace-only lines.
@sequences = grep { /\S/ } @sequences;

print "\n\nNext, it will remove the whitespace from each of the elements of "
    . "the sequences array";
for my $sequence (@sequences) {
    $sequence =~ s/\s//g;
}

print "\n\nFinally, it will print the names and sequences arrays.";
print "\n\nThis is the names array:\n@names";
print "\nThis is the sequences array:\n@sequences";

print "\n\nNow, we want the program to loop through each of the sequence "
    . "we've extracted from the data input file, and then to open an empty "
    . "list that will accept the completely processed sequences.";

# Flat list of (name, top strand, bottom strand) entries, one triple per
# sequence.
my @finished = ();

print "\n\nHere there begins a loop that separately processes each of the "
    . "sequences, and places each pair (top, bottom) into a the array of "
    . "processed sequences.";

# --- Per-sequence processing ------------------------------------------------

for my $index (0 .. $#sequences) {
    my $sequence = $sequences[$index];

    print "\n\nHere is the sequence i'm working on now:\n$sequence";

    print "\n\nFirst, we find the complement of the sequence:";
    $sequence =~ tr/ATCG/TAGC/;
    print "\n\n$sequence.";

    print "\n\nNext, we define an empty array that will contain the fully "
        . "processed bottom and top strands for this sequence.";

    print "\n\nNext, we define the top and bottom strands relative to a "
        . "defined position that marks the end of the linker. In this case "
        . "position 205 is the end of the linker. The top strand extends "
        . "from the beginning of the overall sequence to position 205:";
    my $topstrand = substr($sequence, 0, $LINKER_END);
    print "\n\n$topstrand";

    print "\n\nWe define the bottom strand as the substring between "
        . "position 205 and the end of the whole sequence. \n"
        . "(Note that the end of the sequence is length-1 since Perl calls "
        . "the charcter in the first position 0, not 1):";
    # Everything from the linker end onwards; empty when the sequence is too
    # short to have a bottom strand.
    my $bottomstrand = length($sequence) > $LINKER_END
                     ? substr($sequence, $LINKER_END)
                     : '';
    print "\n\n$bottomstrand";

    print "\n\nWe definte the barcode as the substring of the top strand "
        . "between the beginning of the barcode (pos 198) extending for 8 "
        . "positions";
    my $barcode = length($topstrand) > $BARCODE_START
                ? substr($topstrand, $BARCODE_START, $BARCODE_LENGTH)
                : '';
    print "\n\n$barcode";

    print "\n\nWe will do the change of the barcode to lowercase now, which "
        . "is accomplished by first defining a second barcode substring "
        . "which is the same as the original barcode, this one is called "
        . "newbarcode. Then we change newbarcode to lowercase.";
    # Only T, A and G are lowercased, exactly as before; an uppercase C in
    # the barcode is later rewritten to D with the rest of the top strand.
    # NOTE(review): confirm that leaving C uppercase here is intended.
    (my $newbarcode = $barcode) =~ tr/TAG/tag/;
    print "\n\nHere is the new barcode:$newbarcode";

    print "\n\nNow we insert into topstrand the newbarcode string in the "
        . "position 198 extending for 8 positions";
    if (length $barcode) {
        substr($topstrand, $BARCODE_START, length $barcode, $newbarcode);
    }
    print "\n\nHere is the topstrand with the edited barcode:$topstrand";

    print "\n\nNext, we reverse the order of the top strand:";
    $topstrand = reverse $topstrand;
    print "\n\n$topstrand";

    print "\n\nThen, we substitute D in for C in the top strand, and E in "
        . "for C in the bottom strand. Here are the old strands:"
        . "\n\n$topstrand\n\n$bottomstrand";
    $topstrand    =~ tr/C/D/;
    $bottomstrand =~ tr/C/E/;
    print "\n\nAnd here are the new strands:\n\n$topstrand\n\n$bottomstrand";

    print "\n\nThe difficult part now is to replace all Ts at CpG sites with "
        . "a different letter. We have to query each individual CpG site and "
        . "ask whether this position is a T or not (if not, presumably it is "
        . "a C that we have turned into a D or E). If the position is a T, "
        . "then we replace this with a different letter. We start with the "
        . "bottom strand, and replace all CpG site Ts with Vs. \n\n\n";
    mark_cpg_sites(\$bottomstrand, 'V', @BOTTOM_CPG_SITES);

    print "Now we do the same in the top strand, replacing all of the Ts at "
        . "CpG sites with Us.\n\n";
    mark_cpg_sites(\$topstrand, 'U', @TOP_CPG_SITES);

    print "Let's see if these changes occurred. Here's the new top strand:"
        . "\n\n$topstrand";
    print "\n\nAnd here is the new bottom strand:\n\n$bottomstrand";

    print "\n\nSince we ultimately want to color code the C/T and the "
        . "adjoining G in the CpG dyads, we must replace the Gs within each "
        . "dyad with a different letter. So, in the top strand we replace DG "
        . "with DH, and UG with UH. \n"
        . "In the bottom strand, we replace GE with HE and GV with HV.";
    $topstrand    =~ s/DG/DH/g;
    $topstrand    =~ s/UG/UH/g;
    $bottomstrand =~ s/GE/HE/g;
    $bottomstrand =~ s/GV/HV/g;

    print "\n\nWe also add a line break and seven hyphens at the beginning "
        . "of the bottom strand, which will land it on a line below the top "
        . "strand and align it correctly to make up for the extra 7 nts in "
        . "the top stand barcode.";
    $bottomstrand = "\n-------" . $bottomstrand;

    print "\n\nFinally, we have our finished top and bottom strands...";
    print "\n\n$topstrand";
    print "\n$bottomstrand";

    my @thissequence = ($topstrand, $bottomstrand);
    print "\n\nAnd then we have the array that contains the separated top "
        . "and bottom strands for this sequence:";
    print "\n\n@thissequence";

    # Strip any leading whitespace from the top strand before storing it.
    $topstrand =~ s/^\s+//;

    print "\n\nNow, as the final step in the individual sequence loop, we "
        . "put the seqence name, top strand, and bottom strand, into the "
        . "finished array.\n\n************************************";

    # Names are paired with sequences by position; fall back to an empty
    # name when there are more sequences than ">" header lines.
    my $name = defined $names[$index] ? $names[$index] : '';
    push @finished, "\n\n$name", $topstrand, $bottomstrand;
}

print "\n\n\n\nHere is the finished array that we will copy the text from "
    . "and use to replace the altered letters with color coding is MS Word:"
    . "\n\n@finished\n\n";

exit;