【转】用Python实现merge vcf

转自:http://blog.sina.com.cn/s/blog_c32816210102x6sl.html

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
A general module for merging files.

Author: Shujia Huang
Date: 2017-04-26
"""
import os
import sys
import heapq
import gzip
import re

class FileForQueueing(object):
    """
    Cursor over one delimited text file (e.g. VCF records) for use in a
    k-way merge.

    The current record is exposed as ``chrom``, ``pos`` and ``line``.
    Upcoming lines are kept in a small look-ahead min-heap (``heap``), so the
    record handed out is always the smallest of the buffered ones.

    Instances are ordered by (chromosome, position), which lets them be
    placed on a ``heapq`` directly. Numeric chromosomes sort before named
    ones.

    Works with both text and byte lines, on Python 2 and Python 3.
    """

    # Number of lines the look-ahead heap is filled to before the first
    # record is taken from it.
    BUFFER_SIZE = 100

    def __init__(self, the_file, line):
        """
        Store the file, and initialise the current value.

        the_file -- iterator over the remaining lines; it must also provide
                    ``close()``, which is called when this object is
                    destroyed.
        line     -- first data line, already read from ``the_file``.

        Raises IndexError/ValueError if a line has fewer than two fields or
        a non-integer position.
        """
        self.the_file = the_file
        self.finishedReadingFile = False
        self.heap = []

        heapq.heappush(self.heap, self._parse(line))

        # Fill the look-ahead buffer (stops early at end of file).
        while len(self.heap) < self.BUFFER_SIZE and self._read_line():
            pass

        # Now take the top line
        self._take_smallest()

    @staticmethod
    def _parse(line):
        """
        Build the heap entry ``(chrom_key, pos, line)`` for one data line.

        Only the first two whitespace-separated fields are inspected.
        ``chrom_key`` is ``(0, number)`` when the chromosome converts to an
        integer and ``(1, name)`` otherwise; the leading rank keeps integers
        and strings from ever being compared with each other.
        """
        fields = line.split(None, 2)
        name = fields[0]
        if isinstance(name, bytes) and not isinstance(name, str):
            # Python 3 byte line: use text for the chromosome key.
            name = name.decode("utf-8", "replace")

        # Where possible, convert chromosome names into integers for
        # sorting. If not possible, use original names.
        # NOTE: strip("CHR") removes any leading/trailing 'C', 'H', 'R'
        # characters (not just a "chr" prefix); kept as-is so that the sort
        # order of existing inputs does not change.
        try:
            chrom_key = (0, int(name.upper().strip("CHR")))
        except ValueError:
            chrom_key = (1, name)

        return chrom_key, int(fields[1]), line

    def _read_line(self):
        """
        Move one more line from the file into the look-ahead heap.

        Returns False (and sets ``finishedReadingFile``) at end of file.
        """
        try:
            line = next(self.the_file)
        except StopIteration:
            self.finishedReadingFile = True
            return False

        heapq.heappush(self.heap, self._parse(line))
        return True

    def _take_smallest(self):
        """Pop the smallest buffered record and make it the current one."""
        self._chrom_key, self.pos, self.line = heapq.heappop(self.heap)
        self.chrom = self._chrom_key[1]

    def __lt__(self, other):
        """
        Order by chromosome, then position, of the current record.
        """
        if not isinstance(other, FileForQueueing):
            return NotImplemented
        return (self._chrom_key, self.pos) < (other._chrom_key, other.pos)

    def __del__(self):
        """
        Destructor: close the underlying file.
        """
        # the_file is missing if construction never reached __init__.
        the_file = getattr(self, "the_file", None)
        if the_file is not None:
            the_file.close()

    def next(self):
        """
        Advance to the next record, updating ``chrom``, ``pos`` and ``line``.

        One more line is read into the look-ahead heap (if the file is not
        exhausted) before the smallest buffered record is taken.

        Raises StopIteration when no records are left. Returns None.
        """
        if not self.finishedReadingFile:
            self._read_line()

        if not self.heap:
            raise StopIteration

        self._take_smallest()


def expandedOpen(path, mode):
    """
    Open ``path`` with ``mode``; if that fails with IOError, retry once
    with a leading "~" expanded to the user's home directory.

    Returns the open file object. An IOError from the retry propagates.
    """
    try:
        handle = open(path, mode)
    except IOError:
        # The literal path could not be opened; try the "~"-expanded form.
        handle = open(os.path.expanduser(path), mode)
    return handle


def Open(fileName, mode, compressLevel=9):
    """
    Open a file, transparently handling gzip compression.

    Names ending in ".gz" or ".GZ" are opened with ``gzip.GzipFile`` using
    ``compressLevel``; everything else goes through ``expandedOpen``.
    For gzip files, "~" in the name is expanded only when the directory
    part of the name does not exist as given.
    """
    if not fileName.endswith((".gz", ".GZ")):
        return expandedOpen(fileName, mode)

    if os.path.exists(os.path.dirname(fileName)):
        gz_path = fileName
    else:
        gz_path = os.path.expanduser(fileName)
    return gzip.GzipFile(gz_path, mode, compressLevel)
def merge_files(temp_file_names, final_file_name):
    """
    Merge VCF/CVG files into one final file with a k-way merge.

    Header lines (those starting with "#") are copied from the first input
    only, with every ``Number=<letter>`` rewritten to ``Number=.``. Data
    lines from all inputs are then written in (chromosome, position) order;
    the merge expects each input to be sorted already (FileForQueueing only
    re-sorts within its look-ahead buffer). Records that compare equal are
    emitted in the order of ``temp_file_names``.

    temp_file_names -- paths of the input files (plain or ".gz").
    final_file_name -- output path (".gz" gives gzip output), or "-" for
                       standard output.
    """

    def sort_key(queued):
        # Explicit key instead of comparing FileForQueueing objects, so the
        # merge order never depends on object comparison. The rank keeps
        # numeric chromosomes ahead of named ones and avoids int/str
        # comparisons.
        chrom = queued.chrom
        rank = 0 if isinstance(chrom, int) else 1
        return (rank, chrom), queued.pos

    # Final output file. Lines are handled as bytes throughout (inputs are
    # opened 'rb'); on Python 3 the byte stream of stdout is ``.buffer``.
    if final_file_name == "-":
        output_file = getattr(sys.stdout, "buffer", sys.stdout)
    else:
        output_file = Open(final_file_name, 'wb')

    number_field = re.compile(br'Number=[A-Za-z]')
    the_heap = []

    try:
        # Initialise queue
        for index, file_name in enumerate(temp_file_names):
            the_file = Open(file_name, 'rb')

            for line in the_file:
                if line.startswith(b"#"):
                    if index == 0:
                        # modify the header here
                        output_file.write(number_field.sub(b'Number=.', line))
                else:
                    # First data line: the rest of the file is consumed
                    # through the queueing wrapper.
                    queued = FileForQueueing(the_file, line)
                    heapq.heappush(the_heap,
                                   (sort_key(queued), index, queued))
                    break

            # No data lines in this file: nothing to queue, just close it.
            else:
                the_file.close()

        # Merge-sort the output using a priority queue
        while the_heap:
            _, index, queued = the_heap[0]
            output_file.write(queued.line)

            try:
                queued.next()
            except StopIteration:
                # This input is exhausted: drop it from the queue.
                heapq.heappop(the_heap)
                queued.the_file.close()
            else:
                # Re-queue the same file under its new current record.
                heapq.heapreplace(the_heap,
                                  (sort_key(queued), index, queued))
    finally:
        # Close the output even if the merge fails part-way.
        if final_file_name != "-":
            output_file.close()
        else:
            output_file.flush()
if __name__ == '__main__':
    # Command line: <input> [<input> ...] <output|->
    # Without this check, running with no arguments would make
    # sys.argv[-1] the script itself, which merge_files would then open
    # for writing and truncate.
    if len(sys.argv) < 3:
        sys.stderr.write(
            "Usage: %s <input> [<input> ...] <output|->\n" % sys.argv[0])
        sys.exit(1)

    in_file_list = sys.argv[1:-1]
    out_file = sys.argv[-1]
    merge_files(in_file_list, out_file)

猜你喜欢

转载自blog.csdn.net/weixin_42284584/article/details/80719618