Source code for count_reads_per_region

# Copyright (c) 2011 University of Washington. All rights reserved.

#!/usr/bin/python

# count_reads_per_region.py

import sys, os, math, pysam
import cPickle as pickle
from NativeReadCounter import NativeReadCounter
from ReverseComplementReadCounter import ReverseComplementReadCounter

def get_strand(strd_int):
    return ('top' if strd_int == 1 else 'bottom')


def format_profile_line(name, locus_type, tx_type, read_count_raw, total_read_count, length, gene='', product=''):
    """
    """

    read_count_pM = (read_count_raw * (10 ** 6)) / float(total_read_count)
    read_count_pKM = ((read_count_pM * (10 ** 3)) / float(length) if length != 0 else float('Infinity'))

    if locus_type == 'gene' and (tx_type == 'sense' or tx_type == 'genic'):
        return '{0:9} {1:5d}, {2:7} {3:7d}, {4:7.2f}, {5:7.2f},\t{6:5} "{7}"'.format(name + ',', length, tx_type + ',', read_count_raw, read_count_pM, read_count_pKM, gene + ',', product)
    else:
        return '{0:9} {1:5d}, {2:7} {3:7d}, {4:7.2f}, {5:7.2f}'.format(name + ',', length, tx_type + ',', read_count_raw, read_count_pM, read_count_pKM)


def generate_profile(feature, locus_type, length, counter, total_read_count):
    """
    """

    profile_line = ''

    if counter.with_strand_specificity:
        if locus_type == 'intergene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'igpos', counter.aligned_read_count['igpos'], total_read_count, length) \
                 + '\n' + \
                format_profile_line(feature['locus'], locus_type, 'igneg', counter.aligned_read_count['igneg'], total_read_count, length)

        elif locus_type == 'gene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'sense', counter.aligned_read_count['sense'], total_read_count, length, feature['gene'], feature['product']) \
                 + '\n' + \
                format_profile_line(feature['locus'], locus_type, 'antis', counter.aligned_read_count['antis'], total_read_count, length)
            
    else:
        if locus_type == 'intergene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'inter', counter.aligned_read_count['inter'], total_read_count, length)

        elif locus_type == 'gene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'genic', counter.aligned_read_count['genic'], total_read_count, length, feature['gene'], feature['product'])

    return profile_line


def generate_null_intergene_profile(feature, locus_type, length, counter, total_read_count):
    """
    """

    profile_line = ''
    length = int(math.fabs(length))
    
    if counter.with_strand_specificity:
        profile_line = \
            format_profile_line(feature['locus'], locus_type, 'igpos', 0, total_read_count, length) \
             + '\n' + \
            format_profile_line(feature['locus'], locus_type, 'igneg', 0, total_read_count, length)
    else:
        profile_line = \
            format_profile_line(feature['locus'], locus_type, 'inter', 0, total_read_count, length)
    
    return profile_line


def remove_out_of_region_data(start, end, aligned_position_list):
    """
    """
    return [position for position in aligned_position_list if position >= start and position <= end]


def remove_surplus(locus_type, start, end, counter):
    """
    """
    if counter.with_strand_specificity:
        if locus_type == 'intergene':
            for tx_type in ['igneg', 'igpos']:
                counter.aligned_position_list[tx_type] = remove_out_of_region_data(start, end, counter.aligned_position_list[tx_type])
                counter.aligned_read_count[tx_type] = len(counter.aligned_position_list[tx_type])
        
        elif locus_type == 'gene':
            for tx_type in ['sense', 'antis']:
                counter.aligned_position_list[tx_type] = remove_out_of_region_data(start, end, counter.aligned_position_list[tx_type])
                counter.aligned_read_count[tx_type] = len(counter.aligned_position_list[tx_type])

    else:
        if locus_type == 'intergene':
            counter.aligned_position_list['inter'] = remove_out_of_region_data(start, end, counter.aligned_position_list['inter'])
            counter.aligned_read_count['inter'] = len(counter.aligned_position_list['inter'])

        elif locus_type == 'gene':
            counter.aligned_position_list['genic'] = remove_out_of_region_data(start, end, counter.aligned_position_list['genic'])
            counter.aligned_read_count['genic'] = len(counter.aligned_position_list['genic'])


# ---------------------------------------------------------------

def store_read_count_per_position(type_specific_visual_data, type_specific_aligned_position_list, locus_type):
    """
    """
    
    global possible_double_counting_position

    for position in set(type_specific_aligned_position_list):
        if position not in type_specific_visual_data.keys():
            type_specific_visual_data[position] = type_specific_aligned_position_list.count(position)
        else:
            possible_double_counting_position[locus_type][position] = type_specific_aligned_position_list.count(position)


def generate_visual_data(locus_type, counter, visual_data):
    """
    """

    if counter.with_strand_specificity:
        if locus_type == 'intergene':
            store_read_count_per_position(visual_data['igpos'], counter.aligned_position_list['igpos'], 'igpos')
            store_read_count_per_position(visual_data['igneg'], counter.aligned_position_list['igneg'], 'ignet')

        elif locus_type == 'gene':
            store_read_count_per_position(visual_data['sense'], counter.aligned_position_list['sense'], 'sense')
            store_read_count_per_position(visual_data['antis'], counter.aligned_position_list['antis'], 'antis')

    else:
        if locus_type == 'intergene':
            store_read_count_per_position(visual_data['inter'], counter.aligned_position_list['inter'], 'inter')

        elif locus_type == 'gene':
            store_read_count_per_position(visual_data['genic'], counter.aligned_position_list['genic'], 'genic')

    return


def normalize_visual_data_to_RPM(visual_data, total_read_count):
    """
    """

    for type in visual_data.keys():
        for position in visual_data[type].keys():
            read_count_per_position_per_million = (visual_data[type][position] * (10.0 ** 6)) / total_read_count
            visual_data[type][position] = read_count_per_position_per_million

    return


# ---------------------------------------------------------------

 
[docs]def count_reads_per_region(sample_name, alignment_file, read_length, read_seq_direction, is_strand_specific, reference_name, reference_annotation_file, slice_start, slice_end, total_uniquely_aligned_reads, output_dir): """Count reads over `reference_annotation_file`, print expression profile to sys.stdout, and store a visualisation of this data. `pysam <http://www.cgat.org/~andreas/documentation/pysam/api.html>`_ is a python wrapper of `samtools <http://samtools.sourceforge.net/>`_ which is used to count reads mapped to the loci defined by `reference_genbank` file. Loci types depend on whether the RNA-seq method `is_strand_specific`. For strand-preserving methods = ``yes``: ``sense``, ``antis``: genic loci located on top or bottom strand, respectively ``igpos``, ``igneg``: intergenic loci on top or bottom strand, respectively For these methods, the result of bench-work steps and sequencing can cause the `read_seq_direction` to be reported as ``native`` or ``reverse_complement`` relative to the top-strand of the reference genome. If strandedness is not maintained -- `is_strand_specific` = ``no`` -- loci are reported as ``genic`` or ``inter``. Genic regions are defined as being between the coordinates of a feature extracted from the genbank_file by :func:`create_gene_annotation_pickle`. Intergenic regions are the spaces between these, and are assigned to a locus as being adjacent to the 5' end of a gene. Raw counts are generated, defined by the location of the 5'-end of each primer. .. note: If a read overlaps both an intergenic and a genic locus it is discarded and not counted for either. Read counts are reported as :term:`raw`, :term:`pM`, and :term:`pKM`, the latter two being normalised by the total 'unique' reads reported by :func:`count_mapping` during Step 2. """ # ------------------------- # # INITIALIZE VARIABLES # ------------------------- # slice_start, slice_end = int(slice_start), int(slice_end) total_uniquely_aligned_reads = int(total_uniquely_aligned_reads) samfile = pysam.Samfile(alignment_file, "rb") visual_data = {} global possible_double_counting_position if is_strand_specific.lower() == 'yes': mode_strand_specificity = True visual_data = {'igpos': {}, 'igneg': {}, 'sense': {}, 'antis': {}} possible_double_counting_position = {'igpos': {}, 'igneg': {}, 'sense': {}, 'antis': {}} elif is_strand_specific.lower() == 'no': mode_strand_specificity = False visual_data = {'inter': {}, 'genic': {}} possible_double_counting_position = {'inter': {}, 'genic': {}} else: sys.exit('ERROR: %s is not supported!' % mode_strand_specificity) if os.path.isfile(reference_annotation_file): all_features = pickle.load(open(reference_annotation_file)) if not all_features: sys.exit('ERROR: %s is not found in %s!' % (reference_name, reference_annotation_file)) if slice_start == 0: prv_gene_start, prv_gene_end = 0, 0 else: prv_gene_start = all_features[slice_start - 1]['start'] prv_gene_end = all_features[slice_start - 1]['end'] feature_list_slice = all_features[slice_start : slice_end] else: sys.exit('ERROR: %s is not found!' % reference_annotation_file) # print header if slice_start == 0: print '{0:>8} {1:>7} {2:>6} {3:>9} {4:>8} {5:>8}\t{6:>5} {7}'.format('locus,', 'size,', 'type,', 'raw,', 'pM,', 'pKM,', 'gene,', 'product') for feature in feature_list_slice: # 1) [gene] boundary is inclusive cur_gene_start = feature['start'] cur_gene_end = feature['end'] cur_gene_length = (cur_gene_end - cur_gene_start) + 1 # 2) (intergenic) boundary is exclusive intergene_start = prv_gene_end + 1 intergene_end = cur_gene_start - 1 intergene_length = intergene_end - intergene_start + 1 # 3) fetch reads aligned to intergene first counter = None if read_seq_direction == 'native': counter = NativeReadCounter(mode_strand_specificity) elif read_seq_direction == 'reverse_complement': counter = ReverseComplementReadCounter(mode_strand_specificity) counter.this_feature_type = 'intergene' counter.this_feature_strand = get_strand(feature['strand']) try: """ .. note: Add (read length - 2) to all intergene/gene start because: "...Fetching returns all reads overlapping a region sorted by the lowest aligned base in the reference sequence. Note that it will also return reads that are only partially overlapping with the region. Thus the reads returned might span a region that is larger than the one queried..." """ if (intergene_end - intergene_start) >= read_length: samfile.fetch(reference_name, intergene_start + (read_length - 2), intergene_end, callback = counter) # need to change the fetching-start index # pysam return errors if {intergene_start + (read_length - 2)} > {intergene_end} else: samfile.fetch(reference_name, intergene_start, intergene_end, callback = counter) remove_surplus('intergene', intergene_start, intergene_end, counter) print generate_profile(feature, 'intergene', intergene_length, counter, total_uniquely_aligned_reads) generate_visual_data('intergene', counter, visual_data) except ValueError: print generate_null_intergene_profile(feature, 'intergene', intergene_length, counter, total_uniquely_aligned_reads) # 4) then fetch reads aligned to gene counter = None if read_seq_direction == 'native': counter = NativeReadCounter(mode_strand_specificity) elif read_seq_direction == 'reverse_complement': counter = ReverseComplementReadCounter(mode_strand_specificity) counter.this_feature_type = 'gene' counter.this_feature_strand = get_strand(feature['strand']) """ Note: Didn't put a try block here because don't expect to have a gene with length == 0 or 1 """ if cur_gene_end - cur_gene_start >= read_length: samfile.fetch(reference_name, cur_gene_start + (read_length - 2), cur_gene_end, callback = counter) else: # see explanation for the 'else' above samfile.fetch(reference_name, cur_gene_start, cur_gene_end, callback = counter) remove_surplus('gene', cur_gene_start, cur_gene_end, counter) print generate_profile(feature, 'gene', cur_gene_length, counter, total_uniquely_aligned_reads) generate_visual_data('gene', counter, visual_data) prv_gene_start, prv_gene_end = cur_gene_start, cur_gene_end substituted_reference_name = reference_name.replace('|', '_') substituted_reference_name = substituted_reference_name.replace('.', '_') if substituted_reference_name[-1] == '_': substituted_reference_name = substituted_reference_name[:-1] normalize_visual_data_to_RPM(visual_data, total_uniquely_aligned_reads) with file('%s/%s.%s.rPM.visual.pickle' % (output_dir, sample_name, substituted_reference_name), 'w') as normalized_visual_data_file: pickle.dump(visual_data, normalized_visual_data_file) return 0
if __name__ == '__main__': #f = count_reads_per_region.func_code #if len(sys.argv) != f.co_argcount: # sys.exit('USAGE: %s %s' % (sys.argv[0], ' '.join(f.co_varnames[:f.co_argcount]))) sys.exit(count_reads_per_region(*sys.argv[1:]))