Source code for assemblerflow.templates.mapping2json

#!/usr/bin/env python3


"""
Purpose
-------

This module is intended to generate a json output for mapping results that
can be imported in pATLAS.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``depth_file`` : String with the name of the mash screen output file.
    - e.g.: ``'samtoolsDepthOutput_sampleA.txt'``
- ``json_dict`` : the file that contains the dictionary with keys and values for
        accessions and their respective lengths.
    - e.g.: ``'reads_sample_result_length.json'``
- ``cutoff`` : The cutoff used to trim the unwanted matches for the minimum
        coverage results from mapping. This value may range between 0 and 1.
    - e.g.: ``0.6``


Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "20022018"
__template__ = "mapping2json-nf"

import os
import json

from assemblerflow_utils.assemblerflow_base import get_logger, MainWrapper

logger = get_logger(__file__)

if __file__.endswith(".command.sh"):
    DEPTH_TXT = '$depthFile'
    JSON_LENGTH = '$lengthJson'
    CUTOFF = '$params.cov_cutoff'
    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("DEPTH_TXT: {}".format(DEPTH_TXT))
    logger.debug("JSON_LENGHT: {}".format(JSON_LENGTH))
    logger.debug("CUTOFF: {}".format(CUTOFF))


[docs]def depthfilereader(depth_file, plasmid_length, cutoff): ''' Function that parse samtools depth file and creates 3 dictionaries that will be useful to make the outputs of this script, both the tabular file and the json file that may be imported by pATLAS Parameters ---------- depth_file: str the path to depth file for each sample plasmid_length: dict a dictionary that stores length of all plasmids in fasta given as input cutoff: str the cutoff used to trim the unwanted matches for the minimum coverage results from mapping. This is then converted into a float within this function in order to compare with the value returned from the perc_value_per_ref. Returns ------- percentage_basescovered: dict stores the percentage of the total sequence of a reference/accession (plasmid) in a dictionary ''' depth_dic_coverage = {} for line in depth_file: tab_split = line.split() # split by any white space reference = "_".join(tab_split[0].strip().split("_")[0:3]) # store # only the gi for the reference position = tab_split[1] numreadsalign = float(tab_split[2].rstrip()) if reference not in depth_dic_coverage: depth_dic_coverage[reference] = {} depth_dic_coverage[reference][position] = numreadsalign percentage_basescovered = {} for ref in depth_dic_coverage: # calculates the percentage value per each reference perc_value_per_ref = float(len(depth_dic_coverage[ref])) / \ float(plasmid_length[ref]) # checks if percentage value is higher or equal to the cutoff defined if perc_value_per_ref >= float(cutoff): percentage_basescovered[ref] = perc_value_per_ref return percentage_basescovered
@MainWrapper def main(depth_file, json_dict, cutoff): ''' Function that handles the inputs required to parse depth files from bowtie and dumps a dict to a json file that can be imported into pATLAS. Parameters ---------- depth_file: str the path to depth file for each sample json_dict: str the file that contains the dictionary with keys and values for accessions and their respective lengths cutoff: str the cutoff used to trim the unwanted matches for the minimum coverage results from mapping. This value may range between 0 and 1. ''' # check for the appropriate value for the cutoff value for coverage results try: cutoff_val = float(cutoff) except ValueError: logger.error("Cutoff value should be a string such as: '0.6'. " "The outputted value: {}. Make sure to provide an " "appropriate value for --cov_cutoff".format(cutoff)) # loads dict from file, this file is provided in docker image plasmid_length = json.load(open(json_dict)) # read depth file depth_file_reader = open(depth_file) # first reads the depth file and generates dictionaries to handle the input # to a simpler format logger.info("Reading depth file and creating dictionary to dump") percentage_basescovered = depthfilereader(depth_file_reader, plasmid_length, cutoff_val) # then dump do file output_json = open("{}_mapping.json".format(depth_file), "w") logger.info("Dumping to {}".format("{}_mapping.json".format(depth_file))) output_json.write(json.dumps(percentage_basescovered)) output_json.close() if __name__ == "__main__": main(DEPTH_TXT, JSON_LENGTH, CUTOFF)