真核微生物基因组质量评估工具EukCC的安装和详细使用方法

介绍：

GitHub - EBI-Metagenomics/EukCC: Tool to estimate genome quality of microbial eukaryotes

安装：

docker：

docker pull  microbiomeinformatics/eukcc

推荐conda 环境：

conda install -c conda-forge -c bioconda "eukcc>=2"
# mamba更快
mamba install -c conda-forge -c bioconda "eukcc>=2"

pip install eukcc

数据库配置，docker记得映射目录

mkdir eukccdb
cd eukccdb
wget http://ftp.ebi.ac.uk/pub/databases/metagenomics/eukcc/eukcc2_db_ver_1.1.tar.gz
tar -xzvf eukcc2_db_ver_1.1.tar.gz

数据库下载地址：Index of /pub/databases/metagenomics/eukcc

下载数据库注意版本，一般选版本2吧，

链接：https://ftp.ebi.ac.uk/pub/databases/metagenomics/eukcc/eukcc2_db_ver_1.2.tar.gz

https://ftp.ebi.ac.uk/pub/databases/metagenomics/eukcc/eukcc_db_v1.1.tar.gz

还有个副产品，diamond的数据库，不过好像看不出是diamond的哪个版本生成的，用的时候不好用的话就用下载的数据库再生成一遍吧。

https://ftp.ebi.ac.uk/pub/databases/metagenomics/eukcc/uniref50_20200213_tax.dmnd

如果不知道数据库位置，或者软件找不到位置，那就简单吧，设置DB目录

export EUKCC2_DB=/path/to/.../eukcc2_db_ver_1.1

快速开始

#EukCC on a single MAG
#We assume that you did set you $EUKCC2_DB to the correct location. If not please use the --db flag to pass the database to EukCC.

eukcc single --out outfolder --threads 8 bin.fa
#EukCC will then run on 8 threads. You can pass nucleotide fastas or proteomes to EukCC. It will automatically try to detect if it has to predict proteins or not.

#By default it will never use more than a single threads for placing the genomes in the reference tree, to save memory.

#EukCC on a folder of bins
eukcc folder --out outfolder --threads 8 bins
#EukCC will assume that the folder contains files with the suffix .fa. If that is not the case please adjust the parameter.

序列拼接流程

双端序列需要先构建bam索引

cat binfolder/*.fa > pseudo_contigs.fasta
bwa index pseudo_contigs.fasta
bwa mem -t 8 pseudo_contigs.fasta reads_1.fastq.gz reads_2.fastq.gz  |
    samtools view -q 20 -Sb - |
    samtools sort -@ 8 -O bam - -o alignment.bam
samtools index alignment.bam

利用py脚本生成关联表

binlinks.py  --ANI 99 --within 1500 \
    --out linktable.csv binfolder alignment.bam

If you have multiple bam files, pass all of them to the script (e.g. *.bam).

You will obtain a three column file (bin_1,bin_2,links).

拼接bins

eukcc folder \
    --out outfolder \
    --threads 8  \
    --links linktable.csv \
    binfolder

EukCC 首先将分别对所有bins进行运行。随后，它会识别那些至少达到50%完整度但尚未超过100-improve_percent的中等质量bins。接下来，它会找出那些通过至少100对端读配对与这些中等质量bins相连接的bins。若经过合并后bin的质量评分有所提高，则该bin将会被合并。

已合并的bins可以在输出文件夹中找到。

警示

blinks.py

#!/usr/bin/env python3
import pysam
from Bio import SeqIO
from collections import defaultdict
import os
import argparse
import logging
import csv


def is_in(read, contig_map, within=1000):
    if read.reference_name not in contig_map.keys():
        return False
    if read.reference_start <= within or read.reference_end <= within:
        return True
    elif read.reference_start > (
        contig_map[read.reference_name] - within
    ) or read.reference_end > (contig_map[read.reference_name] - within):
        return True
    else:
        return False


def keep_read(read, contig_map, within=1000, min_ANI=98, min_cov=0):
    ani = (
        (read.query_alignment_length - read.get_tag("NM"))
        / float(read.query_alignment_length)
        * 100
    )
    cov = read.query_alignment_length / float(read.query_length) * 100

    if ani >= min_ANI and cov >= min_cov and is_in(read, contig_map, within) is True:
        return True
    else:
        return False


def contig_map(bindir, suffix=".fa"):
    m = {}
    for f in os.listdir(bindir):
        if f.endswith(suffix) is False:
            continue
        path = os.path.join(bindir, f)
        with open(path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                m[record.name] = len(record.seq)
    return m


def bin_map(bindir, suffix=".fa"):
    contigs = defaultdict(str)
    contigs_per_bin = defaultdict(int)
    for f in os.listdir(bindir):
        if f.endswith(suffix) is False:
            continue
        path = os.path.join(bindir, f)
        binname = os.path.basename(f)
        with open(path, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                contigs[record.name] = binname
                contigs_per_bin[binname] += 1
    return contigs, contigs_per_bin


def read_pair_generator(bam):
    """
    Generate read pairs in a BAM file or within a region string.
    Reads are added to read_dict until a pair is found.
    From: https://www.biostars.org/p/306041/
    """
    read_dict = defaultdict(lambda: [None, None])
    for read in bam.fetch():
        if not read.is_paired or read.is_secondary or read.is_supplementary:
            continue
        qname = read.query_name
        if qname not in read_dict:
            if read.is_read1:
                read_dict[qname][0] = read
            else:
                read_dict[qname][1] = read
        else:
            if read.is_read1:
                yield read, read_dict[qname][1]
            else:
                yield read_dict[qname][0], read
            del read_dict[qname]


def read_bam_file(bamf, link_table, cm, within, ANI):
    samfile = pysam.AlignmentFile(bamf, "rb")

    # generate link table
    logging.info("Parsing Bam file. This can take a few moments")
    for read, mate in read_pair_generator(samfile):
        if keep_read(read, cm, within, min_ANI=ANI) and keep_read(
            mate, cm, within, min_ANI=ANI
        ):
            # fill in the table
            link_table[read.reference_name][mate.reference_name] += 1
            if read.reference_name != mate.reference_name:
                link_table[mate.reference_name][read.reference_name] += 1

    return link_table


def main():
    # set arguments
    # arguments are passed to classes
    parser = argparse.ArgumentParser(
        description="Evaluate completeness and contamination of a MAG."
    )
    parser.add_argument("bindir", type=str, help="Run script on these bins")
    parser.add_argument(
        "bam",
        type=str,
        help="Bam file(s) with reads aligned against all contigs making up the bins",
        nargs="+",
    )
    parser.add_argument(
        "--out",
        "-o",
        type=str,
        required=False,
        help="Path to output table (Default: links.csv)",
        default="links.csv",
    )
    parser.add_argument(
        "--ANI", type=float, required=False, help="ANI of matching read", default=99
    )
    parser.add_argument(
        "--within",
        type=int,
        required=False,
        help="Within this many bp we need the read to map",
        default=1000,
    )
    parser.add_argument(
        "--contigs",
        "-c",
        action="store_true",
        default=False,
        help="Instead of bins print contigs",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        dest="quiet",
        action="store_true",
        default=False,
        help="Silcence most output",
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        default=False,
        help="Debug and thus ignore safety",
    )
    args = parser.parse_args()

    # define logging
    logLevel = logging.INFO
    if args.quiet:
        logLevel = logging.WARNING
    elif args.debug:
        logLevel = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%d-%m-%Y %H:%M:%S: ",
        level=logLevel,
    )

    bindir = args.bindir

    cm = contig_map(bindir)
    bm, contigs_per_bin = bin_map(bindir)
    logging.debug("Found {} contigs".format(len(cm)))

    link_table = defaultdict(lambda: defaultdict(int))
    bin_table = defaultdict(lambda: defaultdict(int))

    # iterate all bam files
    for bamf in args.bam:
        link_table = read_bam_file(bamf, link_table, cm, args.within, args.ANI)

    logging.debug("Created link table with {} entries".format(len(link_table)))

    # generate bin table
    for contig_1, dic in link_table.items():
        for contig_2, links in dic.items():
            bin_table[bm[contig_1]][bm[contig_2]] += links

    logging.debug("Created bin table with {} entries".format(len(bin_table)))

    out_data = []
    logging.debug("Constructing output dict")
    if args.contigs:
        for contig_1, linked in link_table.items():
            for contig_2, links in linked.items():
                out_data.append(
                    {
                        "bin_1": bm[contig_1],
                        "bin_2": bm[contig_2],
                        "contig_1": contig_1,
                        "contig_2": contig_2,
                        "links": links,
                        "bin_1_contigs": contigs_per_bin[bm[contig_1]],
                        "bin_2_contigs": contigs_per_bin[bm[contig_2]],
                    }
                )
    else:
        for bin_1, dic in bin_table.items():
            for bin_2, links in dic.items():
                out_data.append({"bin_1": bin_1, "bin_2": bin_2, "links": links})

    logging.debug("Out data has {} rows".format(len(out_data)))
    # results
    logging.info("Writing output")
    with open(args.out, "w") as fout:
        if len(out_data) > 0:
            cout = csv.DictWriter(fout, fieldnames=list(out_data[0].keys()))
            cout.writeheader()
            for row in out_data:
                cout.writerow(row)
        else:
            logging.warning("No rows to write")


if __name__ == "__main__":
    main()

scripts/filter_euk_bins.py

#!/usr/bin/env python3
#
# This file is part of the EukCC (https://github.com/openpaul/eukcc).
# Copyright (c) 2019 Paul Saary
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# provides all file operation functions
# used inthis package
import os
import argparse
import subprocess
import logging
import tempfile
import gzip
from multiprocessing import Pool


# backup fasta handler, so we can use readonly directories
class fa_class:
    def __init__(self, seq, name, long_name):
        self.seq = seq
        self.name = name
        self.long_name = long_name

    def __str__(self):
        return self.seq

    def __len__(self):
        return len(self.seq)


def Fasta(path):
    """
    Iterator for fasta files
    """
    entry = False

    with open(path) as fin:
        for line in fin:
            if line.startswith(">"):
                if entry is not False:
                    entry.seq = "".join(entry.seq)
                    yield entry
                # define new entry
                long_name = line.strip()[1:]
                name = long_name.split()[0]
                entry = fa_class([], name, long_name)
            else:
                entry.seq.append(line.strip())
        # yield last one
        entry.seq = "".join(entry.seq)
        yield entry


def gunzip(path, tmp_dir):
    """
    Gunzip a file for EukRep
    """
    if path.endswith(".gz"):
        fna_path = os.path.join(tmp_dir, "contigs.fna")
        logging.debug("Going to unzip fasta into {}".format(fna_path))
        with gzip.open(path, "r") as fin, open(fna_path, "w") as fout:
            for line in fin:
                fout.write(line.decode())
        path = fna_path
        logging.debug("Done unzipping {}".format(fna_path))

    return path


class EukRep:
    """Class to call and handle EukRep data"""

    def __init__(self, fasta, eukout, bacout=None, minl=1500, tie="euk"):
        self.fasta = fasta
        self.eukout = eukout
        self.bacout = bacout
        self.minl = minl
        self.tie = tie

    def run(self):
        # command list will be called
        cmd = [
            "EukRep",
            "--min",
            str(self.minl),
            "-i",
            self.fasta,
            "--seq_names",
            "-ff",
            "--tie",
            self.tie,
            "-o",
            self.eukout,
        ]
        if self.bacout is not None:
            cmd.extend(["--prokarya", self.bacout])

        subprocess.run(cmd, check=True, shell=False)

        self.read_result()

    def read_result(self):
        self.euks = self.read_eukfile(self.eukout)

        self.bacs = set()
        if self.bacout is not None:
            self.bacs = self.read_eukfile(self.bacout)

    def read_eukfile(self, path):
        lst = set()
        with open(path) as infile:
            for line in infile:
                lst.add(line.strip())
        return lst


class bin:
    def __init__(self, path, eukrep):
        self.e = eukrep
        self.bname = os.path.basename(path)
        self.path = os.path.abspath(path)

    def __str__(self):
        return "{} euks: {} bacs: {}".format(self.bname, self.table["euks"], self.table["bacs"])

    def stats(self):
        """read bin content and figure genomic composition"""
        logging.debug("Loading bin")
        fa_file = Fasta(self.path)
        stats = {"euks": 0, "bacs": 0, "NA": 0, "sum": 0}
        # loop and compute stats
        logging.debug("Make per bin stats")
        for seq in fa_file:
            if seq.name in self.e.euks:
                stats["euks"] += len(seq)
            elif seq.name in self.e.bacs:
                stats["bacs"] += len(seq)
            else:
                stats["NA"] += len(seq)

        stats["sum"] = sum([v for k, v in stats.items()])

        self.table = stats

    def decide(self, eukratio=0.2, bacratio=0.1, minbp=100000, minbpeuks=1000000):
        """
        rule to handle decision logic
        """
        keep = True
        allb = self.table["sum"]
        if self.table["euks"] < minbpeuks:
            keep = False
            logging.info(f"Eukaryotic DNA amount only {self.table['euks']} instead of target {minbpeuks}")

        elif self.table["euks"] / allb <= eukratio:
            keep = False
            logging.info(f"Eukaryotic DNA ratio not higher than {eukratio}")

        elif self.table["bacs"] / allb >= bacratio:
            keep = False
            logging.info(f"Bacterial DNA content higher than {bacratio}")

        elif self.table["sum"] < minbp:
            keep = False
            logging.info("We did not find at least %d bp of DNA", minbp)

        self.keep = keep


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", help="path for the output table", default="assignment.csv", type=str)
    parser.add_argument("bins", nargs="+", help="all bins to classify", type=str)
    parser.add_argument(
        "--threads",
        "-t",
        type=int,
        help="How many bins should be run in parallel (Default: 1)",
        default=1,
    )
    parser.add_argument(
        "--minl",
        type=int,
        help="define minimal length of contig for EukRep \
                        to classify (default: 1500)",
        default=1500,
    )
    parser.add_argument(
        "--eukratio",
        type=float,
        help="This ratio of eukaryotic DNA to all DNA has to be found\
                    at least (default: 0, ignore)",
        default=0,
    )
    parser.add_argument(
        "--bacratio",
        type=float,
        help="discard bins with bacterial ratio of higher than\
                    (default: 1, ignore)",
        default=1,
    )
    parser.add_argument(
        "--minbp",
        type=float,
        help="Only keep bins with at least n bp of dna\
                    (default: 8000000)",
        default=8000000,
    )
    parser.add_argument(
        "--minbpeuks",
        type=float,
        help="Only keep bins with at least n bp of Eukaryotic dna\
                    (default: 5000000)",
        default=5000000,
    )
    parser.add_argument("--rerun", action="store_true", help="rerun even if output exists", default=False)
    parser.add_argument("--quiet", action="store_true", help="supress information", default=False)
    parser.add_argument("--debug", action="store_true", help="Make it more verbose", default=False)

    args = parser.parse_args()

    # define logging
    logLevel = logging.INFO
    if args.quiet:
        logLevel = logging.WARNING
    elif args.debug:
        logLevel = logging.DEBUG
    logging.basicConfig(format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %H:%M:%S: ", level=logLevel)

    def evaluate_bin(path):
        if not os.path.exists(path):
            logging.error("Can not find file {}".format(path))
            exit(1)

        logging.info("Launch on {}".format(path))
        with tempfile.TemporaryDirectory(prefix="filter_EukRep_") as tmp_dir:
            logging.debug("Using tmp folder: {}".format(tmp_dir))
            eukfile = os.path.join(tmp_dir, "euks.fna")
            bacfile = os.path.join(tmp_dir, "bacs.fna")
            # EukRep can not deal with Gzipped Fasta files, so we unzip it in case it is a Gzip file
            path = gunzip(path, tmp_dir)
            # Launching EukRep
            logging.debug(f"Starting EukRep on {path}")
            eukrep_result = EukRep(path, eukfile, bacfile, minl=args.minl)
            eukrep_result.run()
            b = bin(path, eukrep_result)
            b.stats()
            b.decide(eukratio=args.eukratio, bacratio=args.bacratio, minbp=args.minbp, minbpeuks=args.minbpeuks)
            return b

    # multithreading pool
    pool = Pool(processes=args.threads)
    results = pool.map(evaluate_bin, args.bins)
    pool.close()
    pool.join()

    with open(args.output, "w") as outfile:
        outfile.write("path,binname,passed,bp_eukaryote,bp_prokaryote,bp_unassigned,bp_sum\n")
        for b in results:
            outfile.write(
                f"{b.path},{b.bname},{b.keep},{b.table['euks']},{b.table['bacs']},{b.table['NA']},{b.table['sum']}\n"
            )