Source code for flowcraft.templates.integrity_coverage

#!/usr/bin/env python3

"""
Purpose
-------

This module receives paired FastQ files, a genome size estimate and a minimum
coverage threshold and has three purposes while iterating over the FastQ files:

    - Checks the integrity of FastQ files (corrupted files).
    - Guesses the encoding of FastQ files (this can be turned off in the \
    ``opts`` argument).
    - Estimates the coverage for each sample.

Expected input
--------------

The following variables are expected whether using NextFlow or the
:py:func:`main` executor.

- ``sample_id`` : *Sample Identification string*
    - e.g.: ``'SampleA'``

- ``fastq_pair`` : *Pair of FastQ file paths*
    - e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``

- ``gsize`` : *Expected genome size*
    - e.g.: ``'2.5'``

- ``cov`` : *Minimum coverage threshold*
    - e.g.: ``'15'``

- ``opts`` : *Specify additional arguments for executing integrity_coverage. \
    The arguments should be a string of command line arguments, such as \
    '-e'. The accepted arguments are:*
    - ``'-e'`` : Skip encoding guess.

Generated output
----------------

The generated output are output files that contain an object, usually a string.
(Values within ``${}`` are substituted by the corresponding variable.)

- ``${sample_id}_encoding`` : Stores the encoding for the sample FastQ. If no \
    encoding could be guessed, write 'None' to file.
    - e.g.: ``'Illumina-1.8'`` or ``'None'``

- ``${sample_id}_phred`` : Stores the phred value for the sample FastQ. If no \
    phred could be guessed, write 'None' to file.
    - ``'33'`` or ``'None'``

- ``${sample_id}_coverage`` : Stores the expected coverage of the samples, \
    based on a given genome size.
    - ``'112'`` or ``'fail'``

- ``${sample_id}_report`` : Stores the report on the expected coverage \
    estimation. This string written in this file will appear in the \
    coverage report.
    - ``'${sample_id}, 112, PASS'``

- ``${sample_id}_max_len`` : Stores the maximum read length for the current \
    sample.
    - ``'152'``

Notes
-----

In case of a corrupted sample, all expected output files should have
``'corrupt'`` written.


Code documentation
------------------

"""

__version__ = "1.0.1"
__build__ = "03082018"
__template__ = "integrity_coverage-nf"

import os
import bz2
import gzip
import json
import zipfile

from itertools import chain

from flowcraft_utils.flowcraft_base import get_logger, MainWrapper

logger = get_logger(__file__)

# Set constants when running from Nextflow
if __file__.endswith(".command.sh"):
    # CONSTANTS
    FASTQ_PAIR = '$fastq_pair'.split()
    SAMPLE_ID = '$sample_id'
    GSIZE = float('$gsize')
    MINIMUM_COVERAGE = float('$cov')
    OPTS = '$opts'.split()

    logger.debug("Running {} with parameters:".format(
        os.path.basename(__file__)))
    logger.debug("SAMPLE_ID: {}".format(SAMPLE_ID))
    logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
    logger.debug("GSIZE: {}".format(GSIZE))
    logger.debug("MINIMUM_COVERAGE: {}".format(MINIMUM_COVERAGE))
    logger.debug("OPTS: {}".format(OPTS))

RANGES = {
    'Sanger': [33, (33, 73)],
    'Illumina-1.8': [33, (33, 74)],
    'Solexa': [64, (59, 104)],
    'Illumina-1.3': [64, (64, 104)],
    'Illumina-1.5': [64, (66, 105)]
}
"""
dict: Dictionary containing the encoding values for several fastq formats. The
key contains the format and the value contains a list with the corresponding
phred score and a list with the range of encodings.
"""

COPEN = {
    "gz": gzip.open,
    "bz2": bz2.open,
    "zip": zipfile.ZipFile
}

MAGIC_DICT = {
    b"\\x1f\\x8b\\x08": "gz",
    b"\\x42\\x5a\\x68": "bz2",
    b"\\x50\\x4b\\x03\\x04": "zip"
}
"""
dict: Dictionary containing the binary signatures for three compression formats
(gzip, bzip2 and zip).
"""


[docs]def guess_file_compression(file_path, magic_dict=None): """Guesses the compression of an input file. This function guesses the compression of a given file by checking for a binary signature at the beginning of the file. These signatures are stored in the :py:data:`MAGIC_DICT` dictionary. The supported compression formats are gzip, bzip2 and zip. If none of the signatures in this dictionary are found at the beginning of the file, it returns ``None``. Parameters ---------- file_path : str Path to input file. magic_dict : dict, optional Dictionary containing the signatures of the compression types. The key should be the binary signature and the value should be the compression format. If left ``None``, it falls back to :py:data:`MAGIC_DICT`. Returns ------- file_type : str or None If a compression type is detected, returns a string with the format. If not, returns ``None``. """ if not magic_dict: magic_dict = MAGIC_DICT max_len = max(len(x) for x in magic_dict) with open(file_path, "rb") as f: file_start = f.read(max_len) logger.debug("Binary signature start: {}".format(file_start)) for magic, file_type in magic_dict.items(): if file_start.startswith(magic): return file_type return None
[docs]def get_qual_range(qual_str): """ Get range of the Unicode encode range for a given string of characters. The encoding is determined from the result of the :py:func:`ord` built-in. Parameters ---------- qual_str : str Arbitrary string. Returns ------- x : tuple (Minimum Unicode code, Maximum Unicode code). """ vals = [ord(c) for c in qual_str] return min(vals), max(vals)
[docs]def get_encodings_in_range(rmin, rmax): """ Returns the valid encodings for a given encoding range. The encoding ranges are stored in the :py:data:`RANGES` dictionary, with the encoding name as a string and a list as a value containing the phred score and a tuple with the encoding range. For a given encoding range provided via the two first arguments, this function will return all possible encodings and phred scores. Parameters ---------- rmin : int Minimum Unicode code in range. rmax : int Maximum Unicode code in range. Returns ------- valid_encodings : list List of all possible encodings for the provided range. valid_phred : list List of all possible phred scores. """ valid_encodings = [] valid_phred = [] for encoding, (phred, (emin, emax)) in RANGES.items(): if rmin >= emin and rmax <= emax: valid_encodings.append(encoding) valid_phred.append(phred) return valid_encodings, valid_phred
@MainWrapper def main(sample_id, fastq_pair, gsize, minimum_coverage, opts): """ Main executor of the integrity_coverage template. Parameters ---------- sample_id : str Sample Identification string. fastq_pair : list Two element list containing the paired FastQ files. gsize : float or int Estimate of genome size in Mb. minimum_coverage : float or int Minimum coverage required for a sample to pass the coverage check opts : list List of arbitrary options. See `Expected input`_. """ logger.info("Starting integrity coverage main") # Check for runtime options if "-e" in opts: skip_encoding = True else: skip_encoding = False # Information for encoding guess gmin, gmax = 99, 0 encoding = [] phred = None # Information for coverage estimation chars = 0 nreads = 0 # Information on maximum read length max_read_length = 0 # Get compression of each FastQ pair file file_objects = [] for fastq in fastq_pair: logger.info("Processing file {}".format(fastq)) logger.info("[{}] Guessing file compression".format(fastq)) ftype = guess_file_compression(fastq) # This can guess the compression of gz, bz2 and zip. If it cannot # find the compression type, it tries to open a regular file if ftype: logger.info("[{}] Found file compression: {}".format( fastq, ftype)) file_objects.append(COPEN[ftype](fastq, "rt")) else: logger.info("[{}] File compression not found. Assuming an " "uncompressed file".format(fastq)) file_objects.append(open(fastq)) logger.info("Starting FastQ file parsing") # The '*_encoding' file stores a string with the encoding ('Sanger') # If no encoding is guessed, 'None' should be stored # The '*_phred' file stores a string with the phred score ('33') # If no phred is guessed, 'None' should be stored # The '*_coverage' file stores the estimated coverage ('88') # The '*_report' file stores a csv report of the file # The '*_max_len' file stores a string with the maximum contig len ('155') with open("{}_encoding".format(sample_id), "w") as enc_fh, \ open("{}_phred".format(sample_id), "w") as phred_fh, \ open("{}_coverage".format(sample_id), "w") as cov_fh, \ open("{}_report".format(sample_id), "w") as cov_rep, \ open("{}_max_len".format(sample_id), "w") as len_fh, \ open(".report.json", "w") as json_report, \ open(".status", "w") as status_fh, \ open(".fail", "w") as fail_fh: try: # Iterate over both pair files sequentially using itertools.chain for i, line in enumerate(chain(*file_objects)): # Parse only every 4th line of the file for the encoding # e.g.: AAAA/EEEEEEEEEEE<EEEEEEEEEEEEEEEEEEEEEEEEE (...) if (i + 1) % 4 == 0 and not skip_encoding: # It is important to strip() the line so that any newline # character is removed and not accounted for in the # encoding guess lmin, lmax = get_qual_range(line.strip()) # Guess new encoding if the range expands the previously # set boundaries of gmin and gmax if lmin < gmin or lmax > gmax: gmin, gmax = min(lmin, gmin), max(lmax, gmax) encoding, phred = get_encodings_in_range(gmin, gmax) logger.debug( "Updating estimates at line {} with range {} to" " '{}' (encoding) and '{}' (phred)".format( i, [lmin, lmax], encoding, phred)) # Parse only every 2nd line of the file for the coverage # e.g.: GGATAATCTACCTTGACGATTTGTACTGGCGTTGGTTTCTTA (...) if (i + 3) % 4 == 0: read_len = len(line.strip()) chars += read_len nreads += 1 # Evaluate maximum read length for sample if read_len > max_read_length: logger.debug("Updating maximum read length at line " "{} to {}".format(i, read_len)) max_read_length = read_len # End of FastQ parsing logger.info("Finished FastQ file parsing") # The minimum expected coverage for a sample to pass exp_coverage = round(chars / (gsize * 1e6), 2) # Set json report if "-e" not in opts: json_dic = { "tableRow": [{ "sample": sample_id, "data": [ {"header": "Raw BP", "value": chars, "table": "qc", "columnBar": True}, {"header": "Reads", "value": nreads, "table": "qc", "columnBar": True}, {"header": "Coverage", "value": exp_coverage, "table": "qc", "columnBar": True, "failThreshold": minimum_coverage } ] }], "plotData": [{ "sample": sample_id, "data": { "sparkline": chars } }], } else: json_dic = { "tableRow": [{ "sample": sample_id, "data": [ {"header": "Coverage", "value": exp_coverage, "table": "qc", "columnBar": True, "failThreshold": minimum_coverage } ], }], } # Get encoding if len(encoding) > 0: encoding = set(encoding) phred = set(phred) # Get encoding and phred as strings # e.g. enc: Sanger, Illumina-1.8 # e.g. phred: 64 enc = "{}".format(",".join([x for x in encoding])) phred = "{}".format(",".join(str(x) for x in phred)) logger.info("Encoding set to {}".format(enc)) logger.info("Phred set to {}".format(enc)) enc_fh.write(enc) phred_fh.write(phred) # Encoding not found else: if not skip_encoding: encoding_msg = "Could not guess encoding and phred from " \ "FastQ" logger.warning(encoding_msg) json_dic["warnings"] = [{ "sample": sample_id, "table": "qc", "value": [encoding_msg] }] enc_fh.write("None") phred_fh.write("None") # Estimate coverage logger.info("Estimating coverage based on a genome size of " "{}".format(gsize)) logger.info("Expected coverage is {}".format(exp_coverage)) if exp_coverage >= minimum_coverage: cov_rep.write("{},{},{}\\n".format( sample_id, str(exp_coverage), "PASS")) cov_fh.write(str(exp_coverage)) status_fh.write("pass") # Estimated coverage does not pass minimum threshold else: fail_msg = "Sample with low coverage ({}), below the {} " \ "threshold".format(exp_coverage, minimum_coverage) logger.error(fail_msg) fail_fh.write(fail_msg) cov_fh.write("fail") status_fh.write("fail") cov_rep.write("{},{},{}\\n".format( sample_id, str(exp_coverage), "FAIL")) json_dic["fail"] = [{ "sample": sample_id, "table": "qc", "value": [fail_msg] }] json_report.write(json.dumps(json_dic, separators=(",", ":"))) # Maximum read length len_fh.write("{}".format(max_read_length)) # This exception is raised when the input FastQ files are corrupted except EOFError: logger.error("The FastQ files could not be correctly " "parsed. They may be corrupt") for fh in [enc_fh, phred_fh, cov_fh, cov_rep, len_fh]: fh.write("corrupt") status_fh.write("fail") fail_fh.write("Could not read/parse FastQ. " "Possibly corrupt file") if __name__ == "__main__": main(SAMPLE_ID, FASTQ_PAIR, GSIZE, MINIMUM_COVERAGE, OPTS)