Source code for extract_reads_from_pools

# Copyright (c) 2011 University of Washington. All rights reserved.

#!/usr/bin/python

import sys, os, os.path
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

class FastQTypeError(Exception):
    """Error describing incorrect fastq quality encoding.
       See `FASTQ-format <http://en.wikipedia.org/wiki/FASTQ_format#Encoding>`_ for further info
    """
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)

[docs]def extract_reads(in_fastq_file, in_fastq_type, barcodes, bioseq_start): """Reads are filtered by sequencing quality and barcode, if any and written to disk. :fastq: Sanger, Illumina1.1-1.3,1.8+ | Phred +33 ASCII offset :fastq-illumina: Solexa, Illumina 1.3-1.7 | Phred +64 ASCII offset `FASTQ <http://en.wikipedia.org/wiki/FASTQ_format#Encoding>`_ specification changes over time, with new releases of next-gen sequencing platforms. A :func:`FastQTypeError` will be raised if the `in_fastq_type` is incorrect. The biologically relevant segment of the read is determined by the position given by `bioseq_start`. This is a 1-based position, so for a barcode length of 4, using `sample_bioseq_start` of 5 means the read will be stored starting at the 5th nucleotide in the read. For example, the following read with `barcodes` of ``AAAA,CCCC,GGGG,TTTT`` and `bioseq_start` ``5``:: 1234567 ... 36 AAAACGTACGTACGTACGTACGTACGTACGTACGTA barcode match: AAAA relevant read: CGTACGTACGTACGTACGTACGTACGTACGTA the ``total`` count for ``AAAA`` will be incremented, and if the quality is high, (ie, minimum base-wise quality score >= 20) the read will be added to the ``AAAA`` record and the ``good`` count for ``AAAA`` will be incremented. """ all_reads = {} barcode_list = [] in_fastq_name = in_fastq_file.split('/')[-1] if 'no_barcode' not in barcodes: barcode_list = [x.upper() for x in barcodes.split(',') if len(x) > 0] # barcode length is assumed to be uniform barcode_length = len(barcode_list[0]) for rec in SeqIO.parse(in_fastq_file, in_fastq_type): if not barcode_list: barcode = 'no_barcode' else: barcode = str(rec.seq[:barcode_length]) if not barcode_list or barcode in barcode_list: if barcode not in all_reads: all_reads[barcode] = {'qualified_reads': [], 'no_read_with_this_barcode' : 0, 'no_GOOD_read_with_this_barcode' : 0} all_reads[barcode]['no_read_with_this_barcode'] += 1 if min(rec.letter_annotations["phred_quality"]) >= 20: all_reads[barcode]['no_GOOD_read_with_this_barcode'] += 1 if rec.seq.find("N") != -1: # this is somewhat inefficient, but works raise FastQTypeError('The FASTQ file format is incorrect. There are sequences containg N\'s after filtering') seq_barcode = barcode if not barcode_list: seq_primer = str(rec.seq[0:6]) else: seq_primer = str(rec.seq[len(barcode):len(barcode)+6]) seq_index = all_reads[barcode]['no_GOOD_read_with_this_barcode'] seq_str = str(rec.seq[bioseq_start: ]) # store bwa archive all_reads[barcode]['qualified_reads'].append(">%s|%s-%i|%s\n%s\n" % (seq_barcode, in_fastq_name, seq_index, seq_primer, seq_str)) return all_reads
def print_read_counts(all_reads): """Print read counts for each barcodes in `all_reads` to stdout. """ for barcode in all_reads: sys.stdout.write("--barcode=%s\n--total_read=%i\n--good_read=%i\n--bad_read=%i\n" % ( (barcode, all_reads[barcode]['no_read_with_this_barcode'], all_reads[barcode]['no_GOOD_read_with_this_barcode'], all_reads[barcode]['no_read_with_this_barcode'] - \ all_reads[barcode]['no_GOOD_read_with_this_barcode'])))
[docs]def write_results(all_reads, out_fasta_file): """Write reads for each barcodes in `all_reads` to `out_fasta_file`. """ for barcode in all_reads: head, tail = os.path.split(out_fasta_file) if not os.path.isdir(head + '/' + barcode): os.mkdir(head + '/' + barcode) out_file = '%s/%s/%s_%s' % (head, barcode, barcode, tail) with open(out_file, "w") as out_fasta_handle: out_fasta_handle.writelines(all_reads[barcode]['qualified_reads'])
[docs]def extract_reads_from_pools(in_fastq_file, in_fastq_type, barcodes, bioseq_start, out_fasta_file, verbose='no'): """Run extract_reads_from_pools as a script. Briefly, reads in `in_fastq_file` are filtered for quality and separated by barcode by :func:`extract_reads`, these results written to disk by :func:`write_results`, and read counts are printed to stdout by :func:`print_read_counts` to be collected by the calling function. """ # To adjust from user's 1-base indexing to 0-base indexing bioseq_start = int(bioseq_start) - 1 if int(bioseq_start) > 0 else 0 try: all_reads = extract_reads(in_fastq_file, in_fastq_type, barcodes, bioseq_start) print_read_counts(all_reads) write_results(all_reads, out_fasta_file) except FastQTypeError as err: if not verbose or 'no' in verbose: sys.exit(err) # returns 1 else: raise return 0
if __name__ == '__main__': if len(sys.argv) != 7: sys.exit("USAGE: in_FASTQ, barcode_to_be_extracted, postion_of_biological_seq, out_FASTA, verbose") sys.exit(extract_reads_from_pools(*sys.argv[1:]))