Source code for count_reads_per_region

# Copyright (c) 2011 University of Washington. All rights reserved.

#!/usr/bin/python

# count_reads_per_region.py

import sys, os, math, pysam
import cPickle as pickle
from NativeReadCounter import NativeReadCounter
from ReverseComplementReadCounter import ReverseComplementReadCounter

def get_strand(strd_int):
    return ('top' if strd_int == 1 else 'bottom')


def format_profile_line(name, locus_type, tx_type, read_count_raw, total_read_count, length, gene='', product=''):
    """
    """

    read_count_pM = (read_count_raw * (10 ** 6)) / float(total_read_count)
    read_count_pKM = ((read_count_pM * (10 ** 3)) / float(length) if length != 0 else float('Infinity'))

    if locus_type == 'gene' and (tx_type == 'sense' or tx_type == 'genic'):
        return '{0:9} {1:5d}, {2:7} {3:7d}, {4:7.2f}, {5:7.2f},\t{6:5} "{7}"'.format(name + ',', length, tx_type + ',', read_count_raw, read_count_pM, read_count_pKM, gene + ',', product)
    else:
        return '{0:9} {1:5d}, {2:7} {3:7d}, {4:7.2f}, {5:7.2f}'.format(name + ',', length, tx_type + ',', read_count_raw, read_count_pM, read_count_pKM)


def generate_profile(feature, locus_type, length, counter, total_read_count):
    """
    """

    profile_line = ''

    if counter.with_strand_specificity:
        if locus_type == 'intergene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'igpos', counter.aligned_read_count['igpos'], total_read_count, length) \
                 + '\n' + \
                format_profile_line(feature['locus'], locus_type, 'igneg', counter.aligned_read_count['igneg'], total_read_count, length)

        elif locus_type == 'gene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'sense', counter.aligned_read_count['sense'], total_read_count, length, feature['gene'], feature['product']) \
                 + '\n' + \
                format_profile_line(feature['locus'], locus_type, 'antis', counter.aligned_read_count['antis'], total_read_count, length)
            
    else:
        if locus_type == 'intergene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'inter', counter.aligned_read_count['inter'], total_read_count, length)

        elif locus_type == 'gene':
            profile_line = \
                format_profile_line(feature['locus'], locus_type, 'genic', counter.aligned_read_count['genic'], total_read_count, length, feature['gene'], feature['product'])

    return profile_line


def generate_null_intergene_profile(feature, locus_type, length, counter, total_read_count):
    """
    """

    profile_line = ''
    length = int(math.fabs(length))
    
    if counter.with_strand_specificity:
        profile_line = \
            format_profile_line(feature['locus'], locus_type, 'igpos', 0, total_read_count, length) \
             + '\n' + \
            format_profile_line(feature['locus'], locus_type, 'igneg', 0, total_read_count, length)
    else:
        profile_line = \
            format_profile_line(feature['locus'], locus_type, 'inter', 0, total_read_count, length)
    
    return profile_line


def remove_out_of_region_data(start, end, aligned_position_list):
    """
    """
    return [position for position in aligned_position_list if position >= start and position <= end]


def remove_surplus(locus_type, start, end, counter):
    """
    """
    if counter.with_strand_specificity:
        if locus_type == 'intergene':
            for tx_type in ['igneg', 'igpos']:
                counter.aligned_position_list[tx_type] = remove_out_of_region_data(start, end, counter.aligned_position_list[tx_type])
                counter.aligned_read_count[tx_type] = len(counter.aligned_position_list[tx_type])
        
        elif locus_type == 'gene':
            for tx_type in ['sense', 'antis']:
                counter.aligned_position_list[tx_type] = remove_out_of_region_data(start, end, counter.aligned_position_list[tx_type])
                counter.aligned_read_count[tx_type] = len(counter.aligned_position_list[tx_type])

    else:
        if locus_type == 'intergene':
            counter.aligned_position_list['inter'] = remove_out_of_region_data(start, end, counter.aligned_position_list['inter'])
            counter.aligned_read_count['inter'] = len(counter.aligned_position_list['inter'])

        elif locus_type == 'gene':
            counter.aligned_position_list['genic'] = remove_out_of_region_data(start, end, counter.aligned_position_list['genic'])
            counter.aligned_read_count['genic'] = len(counter.aligned_position_list['genic'])


# ---------------------------------------------------------------

def store_read_count_per_position(type_specific_visual_data, type_specific_aligned_position_list, locus_type):
    """
    """
    
    global possible_double_counting_position

    for position in set(type_specific_aligned_position_list):
        if position not in type_specific_visual_data.keys():
            type_specific_visual_data[position] = type_specific_aligned_position_list.count(position)
        else:
            possible_double_counting_position[locus_type][position] = type_specific_aligned_position_list.count(position)


def generate_visual_data(locus_type, counter, visual_data):
    """
    """

    if counter.with_strand_specificity:
        if locus_type == 'intergene':
            store_read_count_per_position(visual_data['igpos'], counter.aligned_position_list['igpos'], 'igpos')
            store_read_count_per_position(visual_data['igneg'], counter.aligned_position_list['igneg'], 'ignet')

        elif locus_type == 'gene':
            store_read_count_per_position(visual_data['sense'], counter.aligned_position_list['sense'], 'sense')
            store_read_count_per_position(visual_data['antis'], counter.aligned_position_list['antis'], 'antis')

    else:
        if locus_type == 'intergene':
            store_read_count_per_position(visual_data['inter'], counter.aligned_position_list['inter'], 'inter')

        elif locus_type == 'gene':
            store_read_count_per_position(visual_data['genic'], counter.aligned_position_list['genic'], 'genic')

    return


def normalize_visual_data_to_RPM(visual_data, total_read_count):
    """
    """

    for type in visual_data.keys():
        for position in visual_data[type].keys():
            read_count_per_position_per_million = (visual_data[type][position] * (10.0 ** 6)) / total_read_count
            visual_data[type][position] = read_count_per_position_per_million

    return


# ---------------------------------------------------------------

 
[docs]def count_reads_per_region(sample_name, alignment_file, read_length, read_seq_direction, is_strand_specific, reference_name, reference_annotation_file, slice_start, slice_end, total_uniquely_aligned_reads, output_dir):
    """Count reads over `reference_annotation_file`, print expression profile to sys.stdout, and store a visualisation of this data.

    `pysam <http://www.cgat.org/~andreas/documentation/pysam/api.html>`_ is a python wrapper of 
    `samtools <http://samtools.sourceforge.net/>`_ which is used to count reads mapped to the loci 
    defined by `reference_genbank` file. 

    Loci types depend on whether the RNA-seq method `is_strand_specific`. For strand-preserving 
    methods = ``yes``:

        ``sense``, ``antis``: genic loci located on top or bottom strand, respectively  

        ``igpos``, ``igneg``: intergenic loci on top or bottom strand, respectively

    For these methods, the result of bench-work steps and sequencing can cause the 
    `read_seq_direction` to be reported as ``native`` or ``reverse_complement`` relative to the 
    top-strand of the reference genome. 
    
    If strandedness is not maintained -- `is_strand_specific` = ``no`` -- loci are reported as 
    ``genic`` or ``inter``.

   
    Genic regions are defined as being between the coordinates of a feature extracted from the 
    genbank_file by :func:`create_gene_annotation_pickle`. Intergenic regions are the spaces between
    these, and are assigned to a locus as being adjacent to the 5' end of a gene.
    
    Raw counts are generated, defined by the location of the 5'-end of each primer. 

    .. note: 
        
        If a read overlaps both an intergenic and a genic locus it is discarded and not counted 
        for either.

    Read counts are reported as :term:`raw`, :term:`pM`, and :term:`pKM`, the latter two being 
    normalised by the total 'unique' reads reported by :func:`count_mapping` during Step 2. 

    """

    # ------------------------- #
    # INITIALIZE VARIABLES
    # ------------------------- #
    slice_start, slice_end = int(slice_start), int(slice_end)
    total_uniquely_aligned_reads = int(total_uniquely_aligned_reads)

    samfile = pysam.Samfile(alignment_file, "rb")

    visual_data = {}

    global possible_double_counting_position

    if is_strand_specific.lower() == 'yes':
        mode_strand_specificity =  True
        visual_data = {'igpos': {}, 'igneg': {}, 'sense': {}, 'antis': {}}
        possible_double_counting_position = {'igpos': {}, 'igneg': {}, 'sense': {}, 'antis': {}}

    elif is_strand_specific.lower() == 'no':
        mode_strand_specificity = False
        visual_data = {'inter': {}, 'genic': {}}
        possible_double_counting_position = {'inter': {}, 'genic': {}}

    else:
        sys.exit('ERROR: %s is not supported!' % mode_strand_specificity)

    if os.path.isfile(reference_annotation_file):
        all_features = pickle.load(open(reference_annotation_file))
        if not all_features:
            sys.exit('ERROR: %s is not found in %s!' % (reference_name, reference_annotation_file))
        if slice_start == 0:
            prv_gene_start, prv_gene_end = 0, 0
        else:   
            prv_gene_start = all_features[slice_start - 1]['start']
            prv_gene_end   = all_features[slice_start - 1]['end']

        feature_list_slice = all_features[slice_start : slice_end]

    else:
        sys.exit('ERROR: %s is not found!' % reference_annotation_file)

    # print header
    if slice_start == 0: 
        print '{0:>8} {1:>7} {2:>6} {3:>9} {4:>8} {5:>8}\t{6:>5} {7}'.format('locus,', 'size,', 
                'type,', 'raw,', 'pM,', 'pKM,', 'gene,', 'product') 

    for feature in feature_list_slice:

        # 1) [gene] boundary is inclusive
        cur_gene_start = feature['start']
        cur_gene_end = feature['end']
        cur_gene_length = (cur_gene_end - cur_gene_start) + 1

        # 2) (intergenic) boundary is exclusive
        intergene_start = prv_gene_end + 1
        intergene_end = cur_gene_start - 1
        intergene_length = intergene_end - intergene_start + 1

        # 3) fetch reads aligned to intergene first
        counter = None
        if read_seq_direction == 'native':
            counter = NativeReadCounter(mode_strand_specificity)
        elif read_seq_direction == 'reverse_complement':
            counter = ReverseComplementReadCounter(mode_strand_specificity)

        counter.this_feature_type = 'intergene'
        counter.this_feature_strand = get_strand(feature['strand'])
        try:
            """
               .. note: 
            
                   Add (read length - 2) to all intergene/gene start because:

                   "...Fetching returns all reads overlapping a region sorted by the lowest aligned 
                   base in the reference sequence. Note that it will also return reads that are only 
                   partially overlapping with the region. Thus the reads returned might span a 
                   region that is larger than the one queried..."
            """

            if (intergene_end - intergene_start) >= read_length:
                samfile.fetch(reference_name, intergene_start + (read_length - 2), 
                                intergene_end, callback = counter)

            # need to change the fetching-start index 
            # pysam return errors if {intergene_start + (read_length - 2)} > {intergene_end}
            else: 
                samfile.fetch(reference_name, intergene_start, intergene_end, callback = counter)
            
            remove_surplus('intergene', intergene_start, intergene_end, counter)
            print generate_profile(feature, 'intergene', intergene_length, counter, 
                                    total_uniquely_aligned_reads)
            generate_visual_data('intergene', counter, visual_data)

        except ValueError:
            print generate_null_intergene_profile(feature, 'intergene', intergene_length, counter, 
                                                    total_uniquely_aligned_reads)

        
        # 4) then fetch reads aligned to gene
        counter = None
        if read_seq_direction == 'native':
            counter = NativeReadCounter(mode_strand_specificity)
        elif read_seq_direction == 'reverse_complement':
            counter = ReverseComplementReadCounter(mode_strand_specificity)
        
        counter.this_feature_type = 'gene'
        counter.this_feature_strand = get_strand(feature['strand'])

        """
        Note: Didn't put a try block here because don't expect to have a gene with length == 0 or 1
        """

        if cur_gene_end - cur_gene_start >= read_length:
            samfile.fetch(reference_name, cur_gene_start + (read_length - 2), cur_gene_end,                                     callback = counter)

        else: # see explanation for the 'else' above
            samfile.fetch(reference_name, cur_gene_start, cur_gene_end, callback = counter)

        remove_surplus('gene', cur_gene_start, cur_gene_end, counter)
        print generate_profile(feature, 'gene', cur_gene_length, counter, 
                                total_uniquely_aligned_reads)
        generate_visual_data('gene', counter, visual_data)

        prv_gene_start, prv_gene_end = cur_gene_start, cur_gene_end


    substituted_reference_name = reference_name.replace('|', '_')
    substituted_reference_name = substituted_reference_name.replace('.', '_')
    if substituted_reference_name[-1] == '_':
        substituted_reference_name = substituted_reference_name[:-1]

    normalize_visual_data_to_RPM(visual_data, total_uniquely_aligned_reads)
    with file('%s/%s.%s.rPM.visual.pickle' % (output_dir, sample_name, substituted_reference_name),
                'w') as normalized_visual_data_file:
        pickle.dump(visual_data, normalized_visual_data_file)

    return 0

if __name__ == '__main__':
    #f = count_reads_per_region.func_code
    #if len(sys.argv) != f.co_argcount:
    #    sys.exit('USAGE: %s %s' % (sys.argv[0], ' '.join(f.co_varnames[:f.co_argcount])))
    sys.exit(count_reads_per_region(*sys.argv[1:]))
Navigation

Source code for count_reads_per_region

Quick search

Navigation