Source code for Xpression

#!/usr/bin/python

# Copyright (c) 2011 University of Washington. All rights reserved.

# Xpression.py

from Bio import SeqIO
from StatFileWriter import write_stat_file

import sys, os, os.path, subprocess, pysam, glob, pipes
import cPickle as pickle

from subprocess import Popen, CalledProcessError

# ----Command-line option parsing -------
from optparse import OptionParser, OptionGroup, TitledHelpFormatter
from argValidate import validate_arg
#----------------------------------------

# ----Xpression GUI----------------------
from time import strftime
from processTracker import processTracker
#----------------------------------------

# TODO: consider os.path.join (and os.path.samefile) instead of the manual
#   path + '/' + filename concatenation used throughout.

# ----Other------------------------------
# import tempfile
# ---------------------------------------

# ----Xpression GUI----------------------

def track_process(process):
    """For use with the Java GUI; records the process pid to file.
    """
    process_tracker.add(process.pid)  # add involves a disk write
    return process
# These wrappers replicate the original code as closely as possible while
#   still providing access to pid numbers, which is not easily gotten from
#   os module calls.
# Where check_call is not used, a simple .wait() is tacked onto the end of
#   the Popen statement.

def remove_tracking(process):
    """Not used yet...
       For use with the Java GUI; removes this pid from the record.
    """
    process_tracker.remove(process.pid)
    return process

def check_call(process):
    """Maintains the functionality of subprocess.check_call while returning
    the process so its tracking entry can be removed.
    """
    process.wait()  # assuming stdout=None, stderr=None
    if process.returncode != 0:
        raise CalledProcessError(process.returncode, process)
    return remove_tracking(process)
#----------------------------------------
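# A minimal usage sketch (hypothetical command) showing how the wrappers
# compose around Popen:
#
#     proc = track_process(Popen(['echo', 'hello']))  # pid recorded to disk
#     check_call(proc)                                # wait; raise on non-zero exit
#
# check_call() also calls remove_tracking(), so a successful child leaves no
# stale pid entry behind.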


def collect_stats(stat_collector, proc_list, verbose='no'):
    """Read the stdout of each finished process as --arg=value pairs and
    update the stat_collector dictionary.
    """
    try:
        for proc in proc_list:
            check_call(proc)

            barcode_items = proc.communicate()[0].split()
            # 4 tokens per barcode in the stats output
            for i in range(0, len(barcode_items), 4):
                output = parse_arguments(barcode_items[i:i+4])
                barcode = output['--barcode']
                if barcode not in stat_collector:
                    stat_collector[barcode] = dict(read_dict)
                stat_collector[barcode]['total_read'] += int(output['--total_read'])
                stat_collector[barcode]['good_read'] += int(output['--good_read'])
                stat_collector[barcode]['bad_read'] += int(output['--bad_read'])
        return []

    except CalledProcessError:
        for proc in proc_list:
            try:
                proc.poll()
                if proc.returncode is None:  # still running
                    proc.terminate()
            except (IOError, OSError):
                if 'yes' in verbose:
                    raise

        if not verbose or 'no' in verbose:
            sys.exit('ERROR: input fastq may be invalid. Please check the FASTQ format settings and the integrity of the file.\n')
        raise
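# collect_stats() expects each child's stdout to be whitespace-separated
# '--key=value' tokens, four per barcode; a hypothetical example stream:
#
#     --barcode=ACGT --total_read=1000 --good_read=900 --bad_read=100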
       

def parse_arguments(key_value_list):
    """Parse a list of '--key=value' pairs and return a dictionary of these
    items.
    """
    argument_dict = {}

    for item in key_value_list:
        if item[:2] == '--':                  # only process '--key=value' items
            key, value = item.strip().split('=', 1)
            argument_dict[key] = value

    return argument_dict
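# For example, the token stream above parses as:
#
#     parse_arguments(['--barcode=ACGT', '--total_read=1000'])
#     => {'--barcode': 'ACGT', '--total_read': '1000'}
#
# Note that keys keep their leading '--'; callers index the result with the
# full option name.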

def xpression(input_file='', reference_fasta='', reference_genbank='',
              sample_name='RNAseq_xpression', sample_barcode=None,
              input_type='fastq', step='1', sample_strand_specific='no',
              sample_read_seq_direction='reverse_complement', compress='yes',
              allowed_mismatches='2', number_of_cpu='2',
              output_dir=os.getenv('HOME') + '/Xpression_results',
              barcode_multiplex='', sample_bioseq_start='1', verbose=''):
    """Process and analyse FASTQ file(s) `input_file` and write mapping
    statistics, an expression profile and a visualization.

    These analyses are processed over the following steps:

    1. Reads are selected based on quality score and optional barcode.
    2. Filtered reads are mapped to the reference genome. Mapping statistics
       are output.
    3. Expression data is calculated from mapped reads and a genbank file
       supplying gene boundaries and metadata.
    4. Expression data is converted to '.wig' format for visualization in
       IGV, Artemis, and others.

    .. note::
        All arguments are supplied as text, and correspond to command-line
        long-arguments. This pipeline's design is very close to a shell
        pipeline, where subprocesses are started from a parent and supplied
        with text positional or keyword arguments. Since most parameters
        will be passed on directly, keeping the input primarily as text
        means less conversion has to occur, keeping the code somewhat
        cleaner.

    Folders and files specific to `input_file` will be stored in
    `output_dir`/`input_file`/:

    **0_splitted_FASTQ**/
        This folder contains the original FASTQ file, split for easy
        parallel processing.

    **1_merged_FASTA**/
        This folder holds a folder for each `barcode` given in
        `barcode_multiplex` and `sample_barcode`. Each of these is used to
        store a file of all quality reads starting with the given barcode,
        in bwa archive format.

    `sample_name`/
        One folder for each sample run with Xpression will hold
        sample-specific files. These include:

        **2_sorted_BAM**/
            The results of mapping this sample's quality reads to
            `reference_fasta`.

        **3_expression_profile**/
            One expression profile for each reference mapped to, with loci
            defined by `reference_genbank`.

        **4_visualization**/
            One visualization file for each reference will be stored here,
            in .wig format.

        **mapping_statistic.txt**
            This file is a table of mapping statistics of the sample, with
            sequencing quality, mapping rate, and mapping qualities.

    First, `input_file` will be filtered for sequencing quality and
    separated by barcode in :mod:`extract_reads_from_pools`.

    Next, `BWA <http://bio-bwa.sourceforge.net/>`_ maps the quality reads
    for each sample to the `reference_fasta` given. `reference_fasta` is a
    file of nucleic genome data in fasta format. This file may contain
    multiple references, such as the contigs in a draft genome. All
    reference names must match those located in the `reference_genbank`
    file if an expression profile is to be generated. BWA will use
    `number_of_cpu` threads and allow the number of mismatches to the
    reference given by `allowed_mismatches`.

    .. note::
        Unlike some other short-read aligners, BWA accepts in/dels as well
        as mismatches between read and reference. This flexibility allows
        high mapping rates despite the relatively low quality of current
        next-gen sequencing, as well as allowing for point mutations. Its
        very small memory footprint recommends it as well.

    Then the `SAM <http://samtools.sourceforge.net/SAM1.pdf>`_ file
    produced by BWA is used with `reference_genbank` to count reads over
    genic and intergenic loci.
    For a read to be used in further processing, it must have a MAPQ score
    (Phred-scale) of 37 or greater. This quality threshold corresponds to
    the read mapping uniquely; reads with duplicate mapping locations are
    ignored.

    Last, a visualization is created in wiggle format from previously
    stored data corresponding to the expression profile.
    """
    if '1' in step:
        if sample_barcode and 'no_barcode' not in sample_barcode:
            sample_barcode = sample_barcode.upper()
            barcode_multiplex = barcode_multiplex.upper()
            if sample_barcode not in barcode_multiplex:
                barcode_multiplex += ',' + sample_barcode
        else:
            sys.stderr.write('Sample appears to not be multiplexed, extracting all quality reads...\n')
            barcode_multiplex = 'no_barcode'
            sample_barcode = barcode_multiplex

    # check inter-dependencies
    still_to_check = check_inter_deps(locals())
    if not still_to_check:
        sys.stderr.write('ERROR: problem with option interdependency\n')
        return 1

    input_file = input_file.split(',')

    global read_dict
    read_dict = {'total_read': 0, 'good_read': 0, 'bad_read': 0,
                 'mapped_read': 0, 'unmapped_read': 0,
                 'uniquely_mapped_read': 0, 'non_uniquely_mapped_read': 0,
                 'partially_mapped_read': 0}
    stat_collector = {}  # {'barcode': dict(read_dict)}

    # Special dictionary to store original reference names for use in
    # generating the visualization file
    reference_name_lookup = {}

    # save this as tmp to avoid saving output dir in option file;
    # opening this file again creates endless nesting
    seq_file_dir = output_dir + '/' + os.path.basename(input_file[0]).split('.')[0]
    sample_dir = seq_file_dir + '/' + sample_name
    sys.stderr.write('writing to %s\n' % seq_file_dir)
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    # maybe don't make these excluded options part of the option_list
    #local = locals()  # FIX: prints None for all values..
    #with file(sample_dir + '/' + sample_name + '.option', 'w') as option_save:
    #    option_save.writelines(('%s=%s\n' % (x, local.get(x))
    #                            for x in export_dict(local)
    #                            if x not in 'option_filetimestampsample_dirverbose'))

    # file-centric, with barcoded samples in a nested folder, reusing the
    # merged fasta from step 1
    output_dir = seq_file_dir

    # STEP 1: Filter sequencing data
    # -------------------------------
    if '1' in step:
        sys.stderr.write('STEP 1: Extract sequencing reads\n')

        # STEP 1.1: Make subdirectories
        # -----------------------------
        # one folder for input_file
        for sub_dir in ['/0_splitted_FASTQ', '/1_merged_FASTA']:
            path = output_dir + sub_dir
            if not os.path.exists(path):
                os.makedirs(path)
        for sub_dir in ['/2_sorted_BAM', '/3_expression_profile', '/4_visualization']:
            path = sample_dir + sub_dir
            if not os.path.exists(path):
                os.makedirs(path)

        # STEP 1.2: Split the FASTQ file into parts
        # -----------------------------------------
        assert type(input_file) == type([]), "input_file not a list!"
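        # Note for the split below: splitter.sh is given a 4,000,000-line
        # chunk size; at 4 lines per FASTQ record that is 1,000,000 reads
        # per part (an inference from the FASTQ format, not stated in
        # splitter.sh itself).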
        if not os.path.exists(output_dir + '/1_merged_FASTA/' + sample_barcode + '/' + sample_barcode + '.fasta'):
            for in_file in input_file:
                sys.stderr.write('...splitting %s\n' % in_file)
                try:
                    check_call(track_process(Popen(['sh', './splitter.sh', in_file,
                                                    output_dir + '/0_splitted_FASTQ', '4000000'])))
                except CalledProcessError:
                    if not verbose or 'no' in verbose:
                        sys.stderr.write('ERROR: splitting the input sequence file %s for processing failed.\n' % in_file)
                        return -1
                    raise

            # STEP 1.3: Extract reads from splitted FASTQ files
            # -------------------------------------------------
            proc_list = []
            no_running_process = 0
            max_running_process = max(1, int(number_of_cpu) / 2)  # at least one worker
            for splitted_fastq_file in sorted(os.listdir(output_dir + '/0_splitted_FASTQ')):
                sys.stderr.write('...extracting reads from \'%s\'\n' % splitted_fastq_file)
                proc_list.append(track_process(Popen(['python', './extract_reads_from_pools.py',
                                                      output_dir + '/0_splitted_FASTQ/' + splitted_fastq_file,
                                                      input_type, barcode_multiplex, sample_bioseq_start,
                                                      output_dir + '/1_merged_FASTA/' + splitted_fastq_file + '.fa',
                                                      verbose],
                                                     stdout=subprocess.PIPE)))
                no_running_process += 1
                if (no_running_process % max_running_process) == 0:
                    proc_list = collect_stats(stat_collector, proc_list, verbose)
            if len(proc_list) > 0:
                proc_list = collect_stats(stat_collector, proc_list, verbose)

            # STEP 1.4: Write a merged fasta file
            # -----------------------------------
            sys.stderr.write('...writing a merged fasta file\n')
            for barcode_dir in os.listdir(output_dir + '/1_merged_FASTA'):
                try:
                    check_call(track_process(Popen('cat %s > %s' % (
                        pipes.quote(output_dir + '/1_merged_FASTA/' + barcode_dir) + '/*.fa',
                        pipes.quote(output_dir + '/1_merged_FASTA/' + barcode_dir + '/' + barcode_dir + '.fasta')),
                        shell=True)))
                except CalledProcessError:
                    if not verbose or 'no' in verbose:
                        sys.stderr.write('ERROR: combining read archives for barcode %s failed.\n' % barcode_dir)
                        return -1
                    raise

            # if the previous step fails, we don't want to save stats anyway
            with file(output_dir + '/stat.pickle', 'w') as pickle_file:
                pickle.dump(stat_collector, pickle_file)
        else:
            # reuse existing barcode extraction
            sys.stderr.write('...reusing merged fasta file %s.fasta\n' % sample_barcode)
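    # stat.pickle persists the per-barcode read counts between steps, so
    # steps 2-4 can be run later in separate invocations.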
    # STEP 2: Align reads to a reference genome
    # -----------------------------------------
    if '2' in step:
        sys.stderr.write('STEP 2: Align reads to a reference\n')
        with file(output_dir + '/stat.pickle') as pickle_file:
            stat_collector = pickle.load(pickle_file)

        # STEP 2.0: Look for a reference genome; if not yet indexed, index it
        # --------------------------------------------------------------------
        sys.stderr.write('...looking for a reference genome\n')
        # bwa index -a is database.fasta
        if not os.path.exists(reference_fasta + '.sa'):
            try:
                check_call(track_process(Popen(['bwa', 'index', '-a', 'is', reference_fasta])))
            except CalledProcessError:
                if not verbose or 'no' in verbose:
                    sys.stderr.write('ERROR: indexing the fasta reference file %s with bwa index failed.\n'
                                     % pipes.quote(reference_fasta))
                    return -1
                raise
        # samtools faidx ref.fasta
        if not os.path.exists(reference_fasta + '.fai'):
            try:
                check_call(track_process(Popen(['samtools', 'faidx', reference_fasta])))
            except CalledProcessError:
                if not verbose or 'no' in verbose:
                    sys.stderr.write('ERROR: indexing the fasta reference file %s with samtools faidx failed.\n'
                                     % pipes.quote(reference_fasta))
                    return -1
                raise

        # STEP 2.1: Align reads to a bwa-indexed reference genome
        # -------------------------------------------------------
        sys.stderr.write('...aligning reads\n')
        # bwa aln database.fasta short_read.fastq > aln_sa.sai
        check_call(track_process(Popen(('bwa aln ' +
                                        '-n %s ' % allowed_mismatches +
                                        '-t %s ' % number_of_cpu +
                                        '%s ' % pipes.quote(reference_fasta) +
                                        '%s ' % pipes.quote(output_dir + '/1_merged_FASTA/' + sample_barcode + '/' + sample_barcode + '.fasta') +
                                        '> %s' % pipes.quote(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sai')),
                                       shell=True)))
        # bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam
        check_call(track_process(Popen(('bwa samse ' +
                                        '%s ' % pipes.quote(reference_fasta) +
                                        '%s ' % pipes.quote(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sai') +
                                        '%s ' % pipes.quote(output_dir + '/1_merged_FASTA/' + sample_barcode + '/' + sample_barcode + '.fasta') +
                                        '> %s' % pipes.quote(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sam')),
                                       shell=True)))

        # STEP 2.2: Calculate mapping statistics
        # --------------------------------------
        sys.stderr.write('...calculating mapping statistic\n')
        # not tracking: for a small sample, the process prints before
        # parse_arguments can catch the stream
        output = parse_arguments(Popen(['awk', '-f', './count_mapping.awk',
                                        output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sam'],
                                       stdout=subprocess.PIPE).communicate()[0].split())
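        # count_mapping.awk emits whitespace-separated '--key=value' tokens;
        # a hypothetical example of the stream parsed above:
        #     --mapped_read=900 --unmapped_read=100 --uniquely_mapped_read=850
        #     --partially_mapped_read=20 --non_uniquely_mapped_read=30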
        stat_collector[sample_barcode]['mapped_read'] = int(output['--mapped_read'])
        stat_collector[sample_barcode]['unmapped_read'] = int(output['--unmapped_read'])
        stat_collector[sample_barcode]['uniquely_mapped_read'] = int(output['--uniquely_mapped_read'])
        stat_collector[sample_barcode]['partially_mapped_read'] = int(output['--partially_mapped_read'])
        stat_collector[sample_barcode]['non_uniquely_mapped_read'] = int(output['--non_uniquely_mapped_read'])

        with file('%s/%s/mapping_statistic.txt' % (output_dir, sample_name), 'w') as stat_file:
            write_stat_file(stat_collector[sample_barcode], stat_file)
        with file(output_dir + '/stat.pickle', 'w') as pickle_file:
            pickle.dump(stat_collector, pickle_file)

    # STEP 3: Count reads mapped to each intergenic/genic region
    # ----------------------------------------------------------
    if '3' in step:
        sys.stderr.write('STEP 3: Count reads mapped to each genomic region\n')
        with file(output_dir + '/stat.pickle') as pickle_file:
            stat_collector = pickle.load(pickle_file)

        # STEP 3.1: Generate BAM file
        # ---------------------------
        sys.stderr.write('...generating BAM files\n')
        try:
            # samtools view -Sut ref.fasta.fai aln.sam | samtools sort - aln.sorted
            check_call(track_process(Popen('samtools view %s %s %s | samtools sort - %s' % (
                '-Sut',
                pipes.quote(reference_fasta + '.fai'),
                pipes.quote(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sam'),
                pipes.quote(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sorted')),
                shell=True)))
            # samtools index aln.sorted.bam
            check_call(track_process(Popen(['samtools', 'index',
                                            output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sorted.bam'])))
            # delete SAM for disk space...
            #if os.path.exists(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sorted.bam'):
            #    os.remove(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sam')
        except CalledProcessError:
            if not verbose or 'no' in verbose:
                sys.stderr.write('ERROR: converting sam file to binary sam failed.\n')
                return -1
            raise

        # STEP 3.2: Count reads per gene per contig
        # -----------------------------------------
        chunk_size = 500
        max_running_process = max(1, int(number_of_cpu) / 2)  # at least one worker
        # read length is taken from the first record of the merged fasta;
        # this assumes all reads have a uniform length
        sequence_length = len(SeqIO.parse((output_dir + '/1_merged_FASTA/' + sample_barcode + '/' + sample_barcode + '.fasta'), 'fasta').next())
        reference_list = pysam.Samfile(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sorted.bam').references
        for reference_name in reference_list:
            # substitute characters that are unsafe in file names
            substituted_reference_name = reference_name.replace('|', '_').replace('.', '_')
            if substituted_reference_name[-1] == '_':
                substituted_reference_name = substituted_reference_name[:-1]
            reference_annotation_pickle = reference_genbank + '.' + substituted_reference_name + '.annotation.pickle'
            #sys.stderr.write('looking for %s...\n' % reference_annotation_pickle)
            if not os.path.isfile(reference_annotation_pickle):
                sys.stderr.write('...creating internal annotation reference file from %s for reference %s\n'
                                 % (os.path.basename(reference_genbank), reference_name))
                try:
                    check_call(track_process(Popen(['python', './create_gene_annotation_pickle.py',
                                                    reference_genbank, reference_annotation_pickle])))
                except CalledProcessError:
                    if not verbose or 'no' in verbose:
                        sys.stderr.write('ERROR: creating gene annotation internal file failed. Please make sure the genbank reference file contains full CDS information.\n')
                        return -1
                    raise
                sys.stderr.write('...creating a gff3 annotation from genbank for IGV\n')
                # not a critical operation
                track_process(Popen(['python', './pickle2gff.py', reference_annotation_pickle, reference_name]))
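            # Each annotated reference is counted below in chunks of
            # `chunk_size` loci; count_reads_per_region.py workers run at
            # most number_of_cpu/2 at a time, each writing a partial CSV and
            # a visual pickle for later merging.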
            if not os.path.isfile(reference_annotation_pickle):
                sys.stderr.write('...skipping %s, no annotation file found\n' % reference_name)
            else:
                with file(reference_annotation_pickle) as ref_anno_file:
                    ref_length = len(pickle.load(ref_anno_file))  # number of loci
                # Keep the original reference name for later use in Step 4
                reference_name_lookup[substituted_reference_name] = str(reference_name)
                proc_list = []
                no_running_process = 0
                if not os.path.isdir(output_dir + '/' + sample_name + '/3_expression_profile/' + substituted_reference_name + '/visual_pickles'):
                    os.makedirs(output_dir + '/' + sample_name + '/3_expression_profile/' + substituted_reference_name + '/visual_pickles')
                if not os.path.isdir(output_dir + '/' + sample_name + '/3_expression_profile/' + substituted_reference_name + '/expression_files'):
                    os.makedirs(output_dir + '/' + sample_name + '/3_expression_profile/' + substituted_reference_name + '/expression_files')
                for chunk_r in xrange(0, ref_length, chunk_size):
                    slice_start, slice_end = chunk_r, chunk_r + chunk_size
                    if slice_end > ref_length:
                        slice_end = ref_length
                    sys.stderr.write('...counting reads mapped on %s from locus %d to %d\n'
                                     % (reference_name, slice_start, slice_end - 1))
                    proc_list.append(track_process(Popen('python ./count_reads_per_region.py %s %s %s %s %s %s %s %s %s %s %s > %s' % (
                        pipes.quote(sample_name + '_' + "{0:0{1}d}".format(chunk_r, len(str(ref_length)))),
                        pipes.quote(output_dir + '/' + sample_name + '/2_sorted_BAM/' + sample_name + '.sorted.bam'),
                        str(sequence_length),
                        sample_read_seq_direction,
                        sample_strand_specific,
                        "'" + reference_name + "'",
                        pipes.quote(reference_annotation_pickle),
                        slice_start,
                        slice_end,
                        str(stat_collector[sample_barcode]['uniquely_mapped_read']),
                        pipes.quote(output_dir + '/' + sample_name + '/3_expression_profile/' + substituted_reference_name + '/visual_pickles'),
                        pipes.quote(output_dir + '/' + sample_name + '/3_expression_profile/' + substituted_reference_name + '/expression_files/' + sample_name) + '_' + "{0:0{1}d}".format(chunk_r, len(str(ref_length))) + '.' + substituted_reference_name + '.expression_profile.csv'),
                        shell=True)))
                    if len(proc_list) % max_running_process == 0:
                        while len(proc_list) > 0:
                            p = proc_list.pop()
                            p.wait()
                            remove_tracking(p)
                # done; finish the remaining, odd-numbered processes
                while len(proc_list) > 0:
                    p = proc_list.pop()
                    p.wait()
                    remove_tracking(p)

        for substituted_reference_name_dir in glob.glob(output_dir + '/' + sample_name + '/3_expression_profile/*'):
            if os.path.isdir(substituted_reference_name_dir):
                head, substituted_reference_name = os.path.split(substituted_reference_name_dir)
                if os.listdir(substituted_reference_name_dir + '/expression_files/'):  # non-empty
                    # merge expression files
                    try:
                        check_call(track_process(Popen(('cat %s > %s' % (
                            pipes.quote(substituted_reference_name_dir + '/expression_files') + '/*.expression_profile.csv',
                            pipes.quote(output_dir + '/' + sample_name + '/3_expression_profile/' + sample_name + '.' + substituted_reference_name + '.expression_profile.csv'))),
                            shell=True)))
                    except CalledProcessError:
                        if not verbose or 'no' in verbose:
                            sys.stderr.write('ERROR: combining expression profile files failed.\n')
                            return -1
                        raise
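                    # The per-chunk CSV names are zero-padded to
                    # len(str(ref_length)) digits, so the shell glob above
                    # concatenates them in numeric (locus) order without an
                    # explicit sort.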
                    # merge visualisation pickles
                    try:
                        check_call(track_process(Popen(['python', './combine_visual_pickles.py',
                                                        substituted_reference_name_dir + '/visual_pickles',
                                                        output_dir + '/' + sample_name + '/3_expression_profile/' + sample_name + '.' + substituted_reference_name + '.rPM.visual.pickle'])))
                    except CalledProcessError:
                        if not verbose or 'no' in verbose:
                            sys.stderr.write('ERROR: combining visualisation files failed.\n')
                            return -1
                        raise

        # save all filename-safe reference names
        with file(output_dir + '/reference_name.pickle', 'w') as pickle_file:
            pickle.dump(reference_name_lookup, pickle_file)

    # STEP 4: Generate visualization files
    # ------------------------------------
    if '4' in step:
        sys.stderr.write('STEP 4: Visualize expression data\n')
        with file(output_dir + '/reference_name.pickle') as pickle_file:
            reference_name_lookup = pickle.load(pickle_file)
        for visual_file in glob.glob(output_dir + '/' + sample_name + '/3_expression_profile/' + '*.rPM.visual.pickle'):
            sys.stderr.write('...visualize data in %s\n' % visual_file.split('/')[-1])
            reference_genome_name = visual_file.split('/')[-1].split('.')[1]
            try:
                check_call(track_process(Popen(['python', './visualize_expression_data.py',
                                                visual_file, sample_strand_specific,
                                                output_dir + '/' + sample_name + '/4_visualization/',
                                                reference_name_lookup[reference_genome_name]])))
            except CalledProcessError:
                if not verbose or 'no' in verbose:
                    sys.stderr.write('ERROR: visualizing expression data failed.\n')
                    return -1
                raise

    if 'yes' in compress:
        sys.stderr.write('...package and compress results to %s\n'
                         % (output_dir + '/' + sample_name + '_results.zip'))
        try:
            check_call(track_process(Popen(['sh', './package_results.sh',
                                            output_dir + '/' + sample_name + '_results',  # 'tgz', 'zip',
                                            sample_dir])))
        except CalledProcessError:
            if not verbose or 'no' in verbose:
                sys.stderr.write('ERROR: packaging results failed.\n')
                # return
            else:
                raise

    return 0
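# A minimal invocation sketch (hypothetical file names), running all four
# steps on a single un-multiplexed sample:
#
#     xpression(input_file='sample.fastq',
#               reference_fasta='genome.fasta',
#               reference_genbank='genome.gb',
#               sample_name='demo',
#               step='1234')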
# ---- Command-line option parsing ------
# user can give input
# '--sample_name'         : default: Xpression0, Xpression1, etc
# '--sample_barcode'      : default: ''
# '--input_type'          : default: 'fastq'
# '--sample_bioseq_start' : default: len(barcode) + 1
# '--step'                : default: 1;2;3;4
# default values okay
# '--sample_prep_method'        : default: 'RNAseq'
# '--sample_strand_specific'    : default: no  # as flag
# '--sample_read_seq_direction' : default: 'reverse_complement'
# '--allowed_mismatches'        : default: '2'
# '--number_of_cpu'             : default: '2'
# no default value
# '--input_file'        : fastq
# '--output_dir'        : valid dir location
# '--reference_fasta'   : genome fasta
# '--reference_genbank' : genome genbank

def check_inter_deps(options):
    """Check argument interdependencies.

    This should be called as an alternative to validate_arg(). For
    instance, if the pipeline isn't run past step 2, reference_genbank is
    not needed.
    """
    try:
        if '2' in options['step'] and not validate_arg('reference_fasta', options['reference_fasta']):
            sys.exit('ERROR: invalid fasta file %s' % options['reference_fasta'])
        else:
            options.pop('reference_fasta')
        if '3' in options['step'] and not validate_arg('reference_genbank', options['reference_genbank']):
            sys.exit('ERROR: invalid genbank file %s' % options['reference_genbank'])
        else:
            options.pop('reference_genbank')
    except TypeError:
        return None
    except KeyError:
        return None
    return options
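# For example, with step='34' nothing is aligned, so check_inter_deps() pops
# 'reference_fasta' without validating it, and only the genbank reference is
# checked.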
def get_option_parser():
    """Return the option parser specific to Xpression.

    Takes all options, as well as input and reference files.
    """
    # TODO: json/pickle metadata? requires 2.6+
    # {
    #     "help": "helpmessage",
    #     "inputfile": "filename",
    #     "optionfile": "filename",
    #     "optiongroups": {
    #         "sample": {"name": "rnaseq"},
    #         "general": {"mismatch": 2}
    #     }
    # }
    usage = "usage: %prog [options] fastq_file[,fastq_file2[,fastq_fileN]]"
    parser = OptionParser(usage=usage)
    file_group = OptionGroup(parser, "Reference Files")
    details_group = OptionGroup(parser, "Sample Options")
    general_group = OptionGroup(parser, "General Options")

    parser.add_option('--option-file', '-p', metavar='FILE',
                      help='option file with lines of --argument-name=value. if supplied, any other command-line argument given is ignored.')
    parser.add_option('-v', dest='verbose', action='store_true',
                      help='print full trace and raise exception if errors occur, otherwise print a simple explanation and quit')
    file_group.add_option('--reference-fasta', '-f', metavar='FILE',
                          help='fasta file for mapping reads')
    file_group.add_option('--reference-genbank', '-g', metavar='FILE',
                          help='genbank file for calculating gene expression')
    details_group.add_option('--sample-name', '-n', metavar='STR',
                             help='unique sample identifier for output naming, associated with sample-barcode')
    details_group.add_option('--sample-barcode', '-b', metavar='STR', default='no_barcode',
                             help='barcode for this sample if multiplexed')
    details_group.add_option('--sample-bioseq-start', '-i', metavar='INT',  # type='int',
                             help='1-based index of biological sequence start, ie after barcode')
    details_group.add_option('--input-type', '-q', metavar='fastq|fastq-illumina',
                             choices=['fastq', 'fastq-illumina'],
                             help='fastq sequencing quality score format, see Bio.SeqIO.')
    details_group.add_option('--step', '-s', metavar='1234',
                             help='step(s) to run, ie any subsequence of 1,2,3,4')
    details_group.add_option('--sample-strand-specific', '-S', action='store_true',
                             help='strand direction preserved by library prep.')
    details_group.add_option('--sample-read-seq-direction', '-d', metavar='reverse_complement|native',
                             choices=['reverse_complement', 'native'],
                             help='strand direction of actual read sequences.')
    general_group.add_option('--no-compress', '-X', action='store_false', dest='compress',
                             help='do not package results')
    general_group.add_option('--barcode-multiplex', '-e', metavar='BC1,BC2',
                             help='comma-separated list of barcodes to extract from input file')
    general_group.add_option('--allowed-mismatches', '-m', metavar='INT',  # type='int',
                             help='number of mismatches allowed mapping reads to genome')
    general_group.add_option('--number-of-cpu', '-c', metavar='INT', dest='number_of_cpu',  # type='int',
                             help='number of processes to run during parallel processing')
    general_group.add_option('--output-dir', '-o', metavar='DIR', dest='output_dir',
                             help='directory for output folders and files')

    parser.add_option_group(file_group)
    parser.add_option_group(details_group)
    parser.add_option_group(general_group)
    return parser

def parse_command_line():
    """Parse the command line, reconciling arguments with options,
    dependencies, etc.

    If Xpression.py is run without options or arguments, displays help and
    exits. Returns the options as a dictionary, consistent with option_file
    parsing.
    """
    # do something like echo -n $(cat option_file) to put the option file
    # into args if needed
    parser = get_option_parser()
    if len(sys.argv) == 1:
        sys.exit(parser.print_help())
    (options, args) = parser.parse_args()
    option_dict = dict(options.__dict__)
    if options.option_file:
        try:
            with file(options.option_file) as option_f:
                option_dict = parse_arguments(option_f.readlines())
                sys.stderr.write('found file \'%s\': ignoring other arguments.\n' % option_f.name)
        except IOError:
            sys.exit('ERROR: option file does not exist.')
    else:
        if len(args) > 0:  # and file exists
            option_dict['input_file'] = ','.join((x for x in args if len(x) > 0))
        else:
            sys.exit('ERROR: missing input sequencing file')
    #option_dict.pop('option_file')
    parser.destroy()
    return option_dict
#------------------------------------------------------

def export_dict(options):
    """Return a dict of strings from list, int and boolean values.
    """
    ret_dict = {}
    for option in options:
        value = options[option]
        if type(value) == type([]):
            value = ','.join(value)
        else:
            value = str(value)
        ret_dict[option] = value
    return ret_dict

def import_dict(options):
    """Normalize option values to the all-text convention used by
    xpression(): booleans become 'yes'/'no', None becomes '', and any
    leading '--' is stripped from key names.
    """
    ret_dict = {}
    type_conv = {True: 'yes', False: 'no', None: ''}
    for option in options:
        value = options[option]
        if '--' in option[:2]:
            option = option[2:]
        if value in type_conv:
            value = type_conv[value]
        else:
            value = str(value)
        ret_dict[option] = value
    return ret_dict

if __name__ == '__main__':
    # ----Xpression GUI----------------------
    # tracks process pids for SIGTERM from another python function
    # called by GUI since Java only has Process.destroy, which
    # doesn't allow cleanup, and results in 'orphaned/zombie' processes
    process_tracker = processTracker()
    # ---------------------------------------

    # STEP 0: Parse options from command-line
    option_dict = import_dict(parse_command_line())
    sys.exit(xpression(**option_dict))
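# An example option file for --option-file (hypothetical values); one
# '--key=value' per line, parsed by parse_arguments() above:
#
#     --input_file=sample.fastq
#     --reference_fasta=genome.fasta
#     --reference_genbank=genome.gb
#     --sample_name=demo
#     --step=1234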