# Copyright (c) 2011 University of Washington. All rights reserved.
#!/usr/bin/python
import sys, os, os.path
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
class FastQTypeError(Exception):
"""Error describing incorrect fastq quality encoding.
See `FASTQ-format <http://en.wikipedia.org/wiki/FASTQ_format#Encoding>`_ for further info
"""
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
[docs]def extract_reads(in_fastq_file, in_fastq_type, barcodes, bioseq_start):
"""Reads are filtered by sequencing quality and barcode, if any and written to disk.
:fastq: Sanger, Illumina1.1-1.3,1.8+ | Phred +33 ASCII offset
:fastq-illumina: Solexa, Illumina 1.3-1.7 | Phred +64 ASCII offset
`FASTQ <http://en.wikipedia.org/wiki/FASTQ_format#Encoding>`_ specification changes over
time, with new releases of next-gen sequencing platforms. A :func:`FastQTypeError` will be
raised if the `in_fastq_type` is incorrect.
The biologically relevant segment of the read is determined by the position given by
`bioseq_start`. This is a 1-based position, so for a barcode length of 4, using
`sample_bioseq_start` of 5 means the read will be stored starting at the 5th nucleotide in
the read.
For example, the following read with `barcodes` of ``AAAA,CCCC,GGGG,TTTT`` and
`bioseq_start` ``5``::
1234567 ... 36
AAAACGTACGTACGTACGTACGTACGTACGTACGTA
barcode match: AAAA
relevant read: CGTACGTACGTACGTACGTACGTACGTACGTA
the ``total`` count for ``AAAA`` will be incremented, and if the quality is high,
(ie, minimum base-wise quality score >= 20) the read will be added to the ``AAAA`` record and
the ``good`` count for ``AAAA`` will be incremented.
"""
all_reads = {}
barcode_list = []
in_fastq_name = in_fastq_file.split('/')[-1]
if 'no_barcode' not in barcodes:
barcode_list = [x.upper() for x in barcodes.split(',') if len(x) > 0]
# barcode length is assumed to be uniform
barcode_length = len(barcode_list[0])
for rec in SeqIO.parse(in_fastq_file, in_fastq_type):
if not barcode_list:
barcode = 'no_barcode'
else:
barcode = str(rec.seq[:barcode_length])
if not barcode_list or barcode in barcode_list:
if barcode not in all_reads:
all_reads[barcode] = {'qualified_reads': [],
'no_read_with_this_barcode' : 0,
'no_GOOD_read_with_this_barcode' : 0}
all_reads[barcode]['no_read_with_this_barcode'] += 1
if min(rec.letter_annotations["phred_quality"]) >= 20:
all_reads[barcode]['no_GOOD_read_with_this_barcode'] += 1
if rec.seq.find("N") != -1: # this is somewhat inefficient, but works
raise FastQTypeError('The FASTQ file format is incorrect. There are sequences containg N\'s after filtering')
seq_barcode = barcode
if not barcode_list:
seq_primer = str(rec.seq[0:6])
else:
seq_primer = str(rec.seq[len(barcode):len(barcode)+6])
seq_index = all_reads[barcode]['no_GOOD_read_with_this_barcode']
seq_str = str(rec.seq[bioseq_start: ])
# store bwa archive
all_reads[barcode]['qualified_reads'].append(">%s|%s-%i|%s\n%s\n" % (seq_barcode, in_fastq_name, seq_index, seq_primer, seq_str))
return all_reads
def print_read_counts(all_reads):
"""Print read counts for each barcodes in `all_reads` to stdout.
"""
for barcode in all_reads:
sys.stdout.write("--barcode=%s\n--total_read=%i\n--good_read=%i\n--bad_read=%i\n" % (
(barcode,
all_reads[barcode]['no_read_with_this_barcode'],
all_reads[barcode]['no_GOOD_read_with_this_barcode'],
all_reads[barcode]['no_read_with_this_barcode'] - \
all_reads[barcode]['no_GOOD_read_with_this_barcode'])))
[docs]def write_results(all_reads, out_fasta_file):
"""Write reads for each barcodes in `all_reads` to `out_fasta_file`.
"""
for barcode in all_reads:
head, tail = os.path.split(out_fasta_file)
if not os.path.isdir(head + '/' + barcode):
os.mkdir(head + '/' + barcode)
out_file = '%s/%s/%s_%s' % (head, barcode, barcode, tail)
with open(out_file, "w") as out_fasta_handle:
out_fasta_handle.writelines(all_reads[barcode]['qualified_reads'])
[docs]def extract_reads_from_pools(in_fastq_file, in_fastq_type, barcodes, bioseq_start, out_fasta_file, verbose='no'):
"""Run extract_reads_from_pools as a script.
Briefly, reads in `in_fastq_file` are filtered for quality and separated by barcode by
:func:`extract_reads`, these results written to disk by :func:`write_results`, and read
counts are printed to stdout by :func:`print_read_counts` to be collected by the calling
function.
"""
# To adjust from user's 1-base indexing to 0-base indexing
bioseq_start = int(bioseq_start) - 1 if int(bioseq_start) > 0 else 0
try:
all_reads = extract_reads(in_fastq_file, in_fastq_type, barcodes, bioseq_start)
print_read_counts(all_reads)
write_results(all_reads, out_fasta_file)
except FastQTypeError as err:
if not verbose or 'no' in verbose:
sys.exit(err) # returns 1
else:
raise
return 0
if __name__ == '__main__':
if len(sys.argv) != 7:
sys.exit("USAGE: in_FASTQ, barcode_to_be_extracted, postion_of_biological_seq, out_FASTA, verbose")
sys.exit(extract_reads_from_pools(*sys.argv[1:]))