Source code for flowcraft.templates.mashdist2json

#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to generate a json output for mash dist results that
can be imported in pATLAS.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``mash_output`` : String with the name of the mash screen output file.
    - e.g.: ``'fastaFileA_mashdist.txt'``


Code documentation
------------------

"""

__version__ = "1.4.0"
__build__ = "04092018"
__template__ = "mashsdist2json-nf"

import json
import os

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
    MASH_TXT = '$mashtxt'
    HASH_CUTOFF = '$shared_hashes'
    SAMPLE_ID = '$sample_id'
    ASSEMBLY_IN = '$fasta'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("MASH_TXT: {}".format(MASH_TXT))
    logger.debug("HASH_CUTOFF: {}".format(HASH_CUTOFF))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("ASSEMBLY_IN: {}".format(ASSEMBLY_IN))


[docs]def send_to_output(master_dict, mash_output, sample_id, assembly_file):
    """Send dictionary to output json file
    This function sends master_dict dictionary to a json file if master_dict is
    populated with entries, otherwise it won't create the file

    Parameters
    ----------
    master_dict: dict
        dictionary that stores all entries for a specific query sequence
        in multi-fasta given to mash dist as input against patlas database
    last_seq: str
        string that stores the last sequence that was parsed before writing to
        file and therefore after the change of query sequence between different
        rows on the input file
    mash_output: str
        the name/path of input file to main function, i.e., the name/path of
        the mash dist output txt file.
    sample_id: str
        The name of the sample being parse to .report.json file

    Returns
    -------

    """

    plot_dict = {}

    # create a new file only if master_dict is populated
    if master_dict:
        out_file = open("{}.json".format(
            "".join(mash_output.split(".")[0])), "w")
        out_file.write(json.dumps(master_dict))
        out_file.close()

        # iterate through master_dict in order to make contigs the keys
        for k,v in master_dict.items():
            if not v[2] in plot_dict:
                plot_dict[v[2]] = [k]
            else:
                plot_dict[v[2]].append(k)

        number_hits = len(master_dict)
    else:
        number_hits = 0

    json_dic = {
        "tableRow": [{
            "sample": sample_id,
            "data": [{
                "header": "Mash Dist",
                "table": "plasmids",
                "patlas_mashdist": master_dict,
                "value": number_hits
            }]
        }],
        "plotData": [{
            "sample": sample_id,
            "data": {
                "patlasMashDistXrange": plot_dict
            },
            "assemblyFile": assembly_file
        }]
    }

    with open(".report.json", "w") as json_report:
        json_report.write(json.dumps(json_dic, separators=(",", ":")))


@MainWrapper
def main(mash_output, hash_cutoff, sample_id, assembly_file):
    """
    Main function that allows to dump a mash dist txt file to a json file

    Parameters
    ----------
    mash_output: str
        A string with the input file.
    hash_cutoff: str
        the percentage cutoff for the percentage of shared hashes between query
        and plasmid in database that is allowed for the plasmid to be reported
        to the results outputs
    sample_id: str
        The name of the sample.
    """

    input_f = open(mash_output, "r")

    master_dict = {}

    for line in input_f:

        tab_split = line.split("\t")
        current_seq = tab_split[1].strip()
        ref_accession = "_".join(tab_split[0].strip().split("_")[0:3])
        mash_dist = tab_split[2].strip()
        hashes_list = tab_split[-1].strip().split("/")

        # creates a percentage of the shared hashes between the sample and the
        # reference
        perc_hashes = float(hashes_list[0]) / float(hashes_list[1])

        # if ref_accession already in dict, i.e., if the same accession number
        # matches more than one contig.
        if ref_accession in master_dict.keys():
            current_seq += ", {}".format(master_dict[ref_accession][-1])

        # assures that only the hashes with a given shared percentage are
        # reported to json file
        if perc_hashes > float(hash_cutoff):

            master_dict[ref_accession] = [
                round(1 - float(mash_dist), 2),
                round(perc_hashes, 2),
                current_seq
            ]

    # assures that file is closed in last iteration of the loop
    send_to_output(master_dict, mash_output, sample_id, assembly_file)


if __name__ == "__main__":

    main(MASH_TXT, HASH_CUTOFF, SAMPLE_ID, ASSEMBLY_IN)