Source code for create_gene_annotation_pickle

# Copyright (c) 2011 University of Washington. All rights reserved.

#!/usr/bin/python

import sys, cPickle as pickle
from Bio import SeqIO

[docs]def store_feature_info(feature, all_feature_list): """ """ locus = feature.qualifiers["locus_tag"][0] strand = feature.strand # change the start position from Python-style (begin with 0) to chromosome-style (begin with 1) indexing start = feature.location.start.position + 1 # don't need to change the end position end = feature.location.end.position gene = "n/a" if "gene" in feature.qualifiers.keys(): gene = feature.qualifiers["gene"][0] product = "n/a" if "product" in feature.qualifiers.keys(): product = feature.qualifiers["product"][0] all_feature_list.append({'locus': locus, \ 'strand': strand, \ 'start': start, \ 'end': end, \ 'gene': gene, \ 'product': product})
[docs]def create_gene_annotation_pickle(in_genbank_file_name, in_pickle_file_name): """Create list of dictionaries from genbank loci entries and store as pickle. The reference name included as part of `in_pickle_file_name` is used to search records in `in_genbank_file_name`. Any record included in the pickle file name will be included in the pickle. Entry types included are: 'CDS', 'tRNA', 'rRNA', 'misc_RNA', 'tmRNA', and 'ncRNA'. This script is intended to create a single pickle for each reference that may be contained in the Genbank file. For instance, if the genome is made up of multiple contigs, for each contig, corresponding to its fasta entry name, a pickle file will be created. The filename format could be: genome.contigN.gb.annotation.pickle. """ #all_feature_list = {} with SeqIO.parse(open(in_genbank_file_name, 'rU'), 'genbank') as seq_records: all_feature_list = [] for seq_record in seq_records: # look for specific reference name as part of annotation pickle if seq_record.name in in_pickle_file_name: for feature in seq_record.features: if feature.type in ['CDS', 'tRNA', 'rRNA', 'misc_RNA', 'tmRNA', 'ncRNA']: store_feature_info(feature, all_feature_list) break if len(all_feature_list) == 0: sys.exit('ERROR: given genbank appears to contain no loci information') else: sys.stderr.write('...read %d entries from %s\n' % (len(all_feature_list), in_genbank_file_name)) with file(in_pickle_file_name, 'w') as out_pickle_file: pickle.dump(all_feature_list, out_pickle_file)
if __name__ == '__main__': if len(sys.argv) != 3: sys.exit('USAGE: ' + sys.argv[0] + ' in_genbank_filename in_pickle_filename') create_gene_annotation_pickle(*sys.argv[1:])