Source code for emase.AlignmentMatrixFactory

#!/usr/bin/env python

import os
import pysam
import tables
from scipy.sparse import coo_matrix
import numpy as np
import struct


[docs]class AlignmentMatrixFactory(): def __init__(self, alnfile): #, haplotype_names=None, locus_names=None, read_names=None): self.alnfile = alnfile self.hname = None self.lname = None self.rname = None self.tmpfiles = None # self.num_alignments = None
[docs] def prepare(self, haplotypes, loci, delim='_', outdir=None): if len(haplotypes) > 0: # Suffices given self.hname = haplotypes else: # Suffix not given self.hname = ['h0'] self.lname = loci self.rname = set() fh = pysam.Samfile(self.alnfile, 'rb') for aln in fh.fetch(until_eof=True): self.rname.add(aln.qname) self.rname = np.array(sorted(list(self.rname))) num_loci = len(self.lname) num_reads = len(self.rname) lid = dict(zip(self.lname, np.arange(num_loci))) rid = dict(zip(self.rname, np.arange(num_reads))) self.tmpfiles = dict.fromkeys(self.hname) if outdir is None: outdir = os.path.dirname(self.alnfile) fhout = dict.fromkeys(self.hname) for hap in self.hname: outfile = os.path.join(outdir, "%s_%d.bin" % (hap, os.getpid())) self.tmpfiles[hap] = outfile fhout[hap] = open(outfile, "wb") fh = pysam.Samfile(self.alnfile, 'rb') if len(haplotypes) > 0: # Suffices given for aln in fh.fetch(until_eof=True): if aln.flag != 4 and aln.flag != 8: locus, hap = fh.getrname(aln.tid).split(delim) fhout[hap].write(struct.pack('>I', rid[aln.qname])) fhout[hap].write(struct.pack('>I', lid[locus])) else: # Suffix not given hap = self.hname[0] for aln in fh.fetch(until_eof=True): if aln.flag != 4 and aln.flag != 8: locus = fh.getrname(aln.tid) fhout[hap].write(struct.pack('>I', rid[aln.qname])) fhout[hap].write(struct.pack('>I', lid[locus])) for hap in self.hname: fhout[hap].close()
[docs] def produce(self, h5file, title='Alignments', index_dtype='uint32', data_dtype=float, complib='zlib', incidence_only=True): h5fh = tables.open_file(h5file, 'w', title=title) fil = tables.Filters(complevel=1, complib=complib) h5fh.set_node_attr(h5fh.root, 'incidence_only', incidence_only) h5fh.set_node_attr(h5fh.root, 'mtype', 'csc_matrix') h5fh.set_node_attr(h5fh.root, 'shape', (len(self.lname), len(self.hname), len(self.rname))) h5fh.set_node_attr(h5fh.root, 'hname', self.hname) h5fh.create_carray(h5fh.root, 'lname', obj=self.lname, title='Locus Names', filters=fil) h5fh.create_carray(h5fh.root, 'rname', obj=self.rname, title='Read Names', filters=fil) for hid in xrange(len(self.hname)): hap = self.hname[hid] infile = self.tmpfiles[hap] dmat = np.fromfile(open(infile, 'rb'), dtype='>I') dmat = dmat.reshape((len(dmat)/2, 2)).T if dmat.shape[0] > 2: dvec = dmat[2] else: dvec = np.ones(dmat.shape[1]) spmat = coo_matrix((dvec, dmat[:2]), shape=(len(self.rname), len(self.lname))) spmat = spmat.tocsc() hgroup = h5fh.create_group(h5fh.root, 'h%d' % hid, 'Sparse matrix components for Haplotype %d' % hid) i1 = h5fh.create_carray(hgroup, 'indptr', obj=spmat.indptr.astype(index_dtype), filters=fil) i2 = h5fh.create_carray(hgroup, 'indices', obj=spmat.indices.astype(index_dtype), filters=fil) if not incidence_only: d = h5fh.create_carray(hgroup, 'data', obj=spmat.data.astype(data_dtype), filters=fil) h5fh.flush() h5fh.close()
[docs] def cleanup(self): for tmpfile in self.tmpfiles.itervalues(): os.remove(tmpfile)
if __name__ == "__main__": pass # TODO: Put some simple test