Source code for flowcraft.templates.trimmomatic_report

#!/usr/bin/env python3

"""
Purpose
-------

This module is intended to parse the results of the Trimmomatic log for a set
of one or more samples.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``log_files``: Trimmomatic log files.
    - e.g.: ``'Sample1_trimlog.txt Sample2_trimlog.txt'``


Generated output
----------------
- ``trimmomatic_report.csv`` : Summary report of the trimmomatic logs for\
    all samples

Code documentation
------------------

"""

__version__ = "1.0.0"
__build__ = "16012018"
__template__ = "trimmomatic_report-nf"

import os
import json

from collections import OrderedDict

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)


# LOG_FILES is only defined when this file runs under a name ending in
# ".command.sh" (presumably the name given to the rendered template by the
# workflow engine -- confirm). The quoted placeholder is the ``log_files``
# template variable described in the module docstring: a whitespace-separated
# string of Trimmomatic log paths, split here into a list.
if __file__.endswith(".command.sh"):
    LOG_FILES = '$log_files'.split()


def parse_log(log_file):
    """Retrieves some statistics from a single Trimmomatic log file.

    This function parses Trimmomatic's log file and stores some trimming
    statistics in an :py:class:`OrderedDict` object. This object contains
    the following keys:

        - ``clean_len``: Total length after trimming.
        - ``total_trim``: Total trimmed base pairs.
        - ``total_trim_perc``: Total trimmed base pairs in percentage.
        - ``5trim``: Total base pairs trimmed at 5' end.
        - ``3trim``: Total base pairs trimmed at 3' end.
        - ``bad_reads``: Number of reads whose length after trimming is 0.

    Blank (empty or whitespace-only) lines are skipped instead of aborting
    the parse.

    Parameters
    ----------
    log_file : str
        Path to trimmomatic log file.

    Returns
    -------
    x : :py:class:`OrderedDict`
        Object storing the trimming statistics.

    Raises
    ------
    ValueError
        If one of the last four fields of a line is not an integer.
    IndexError
        If a non-blank line has fewer than four whitespace-separated fields.
    """

    template = OrderedDict([
        # Total length after trimming
        ("clean_len", 0),
        # Total trimmed base pairs
        ("total_trim", 0),
        # Total trimmed base pairs in percentage
        ("total_trim_perc", 0),
        # Total trimmed at 5' end
        ("5trim", 0),
        # Total trimmed at 3' end
        ("3trim", 0),
        # Bad reads (completely trimmed)
        ("bad_reads", 0)
    ])

    with open(log_file) as fh:

        for line in fh:
            tokens = line.split()

            # A blank line carries no read record; indexing its (empty)
            # field list would raise IndexError, so skip it.
            if not tokens:
                continue

            # The last four fields of each record are:
            # 0. read length after trimming
            # 1. amount trimmed from the start
            # 2. last surviving base
            # 3. amount trimmed from the end
            fields = [int(x) for x in tokens[-4:]]

            if not fields[0]:
                template["bad_reads"] += 1

            template["5trim"] += fields[1]
            template["3trim"] += fields[3]
            template["total_trim"] += fields[1] + fields[3]
            template["clean_len"] += fields[0]

    total_len = template["clean_len"] + template["total_trim"]

    # Guard against division by zero when the log holds no bases at all
    if total_len:
        template["total_trim_perc"] = round(
            (template["total_trim"] / total_len) * 100, 2)
    else:
        template["total_trim_perc"] = 0

    return template
def write_report(storage_dic, output_file, sample_id):
    """Write the CSV summary and the JSON report of the trimming statistics.

    One CSV row is written per entry of ``storage_dic``, after a fixed
    header. For each entry a compact JSON object is also written to
    ``.report.json`` in the current working directory.

    Parameters
    ----------
    storage_dic : dict or :py:class:`OrderedDict`
        Storage containing the trimming statistics, keyed by sample. See
        :py:func:`parse_log` for its generation.
    output_file : str
        Path where the CSV output file will be generated.
    sample_id : str
        Id or name of the current sample, used in the JSON report.
    """

    # The escaped newline literals are kept exactly as they are
    # (presumably unescaped when the template is rendered -- confirm).
    header = ("Sample,Total length,Total trimmed,%,5end Trim,3end Trim,"
              "bad_reads\\n")

    with open(output_file, "w") as csv_out, \
            open(".report.json", "w") as report_out:

        csv_out.write(header)

        for name, stats in storage_dic.items():
            # CSV row: sample name followed by every statistic, in order
            row = ",".join(map(str, stats.values()))
            csv_out.write("{},{}\\n".format(name, row))

            # JSON report entry for this set of statistics
            table_cell = {
                "header": "trimmed",
                "value": stats["total_trim_perc"],
                "table": "qc",
                "columnBar": True,
            }
            table_row = {"sample": sample_id, "data": [table_cell]}
            plot_entry = {
                "sample": sample_id,
                "data": {"sparkline": stats["clean_len"]},
            }
            payload = {
                "tableRow": [table_row],
                "plotData": [plot_entry],
                "badReads": stats["bad_reads"],
            }

            report_out.write(json.dumps(payload, separators=(",", ":")))
@MainWrapper
def main(log_files):
    """Main executor of the trimmomatic_report template.

    Each log file is parsed with :py:func:`parse_log`, stored under its
    sample id (the file name without the ``_trimlog.txt`` suffix) and then
    deleted. The collected statistics are finally written with
    :py:func:`write_report` to ``trimmomatic_report.csv``.

    Parameters
    ----------
    log_files : list
        List of paths to the trimmomatic log files.
    """

    suffix = "_trimlog.txt"
    log_storage = OrderedDict()

    for log in log_files:

        # Remove the exact suffix only. str.rstrip() treats its argument as
        # a set of characters, so it would also eat trailing characters of
        # the sample name itself (e.g. "Control_trimlog.txt" -> "Con").
        log_id = log[:-len(suffix)] if log.endswith(suffix) else log

        # Populate storage of current sample
        log_storage[log_id] = parse_log(log)

        # Remove temporary trim log file
        os.remove(log)

    # NOTE(review): ``log_id`` is only bound inside the loop, so an empty
    # ``log_files`` raises NameError here -- unchanged from the original.
    write_report(log_storage, "trimmomatic_report.csv", log_id)


if __name__ == '__main__':
    main(LOG_FILES)