#!/usr/bin/env python3
"""
Purpose
-------
This module is intended to run FastQC on paired-end FastQ files.
Expected input
--------------
The following variables are expected whether using NextFlow or the
:py:func:`main` executor.
- ``fastq_pair`` : *Pair of FastQ file paths*
- e.g.: ``'SampleA_1.fastq.gz SampleA_2.fastq.gz'``
Generated output
----------------
The generated output are output files that contain an object, usually a string.
- ``pair_{1,2}_data`` : File containing FastQC report at the nucleotide level\
for each pair
- e.g.: ``'pair_1_data'`` and ``'pair_2_data'``
- ``pair_{1,2}_summary``: File containing FastQC report for each category and\
for each pair
- e.g.: ``'pair_1_summary'`` and ``'pair_2_summary'``
Code documentation
------------------
"""
__version__ = "1.0.1"
__build__ = "28032018"
__template__ = "fastqc-nf"
import os
import subprocess
from subprocess import PIPE
from os.path import exists, join
from flowcraft_utils.flowcraft_base import get_logger, MainWrapper
logger = get_logger(__file__)
def __get_version_fastqc():
try:
cli = ["fastqc", "--version"]
p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE)
stdout, _ = p.communicate()
version = stdout.strip().split()[1][1:].decode("utf8")
except Exception as e:
logger.debug(e)
version = "undefined"
return {
"program": "FastQC",
"version": version,
}
if __file__.endswith(".command.sh"):
FASTQ_PAIR = '$fastq_pair'.split()
ADAPTER_FILE = '$ad'
CPUS = '$task.cpus'
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("FASTQ_PAIR: {}".format(FASTQ_PAIR))
logger.debug("ADAPTER_FILE: {}".format(ADAPTER_FILE))
logger.debug("CPUS: {}".format(CPUS))
[docs]def convert_adatpers(adapter_fasta):
"""Generates an adapter file for FastQC from a fasta file.
The provided adapters file is assumed to be a simple fasta file with the
adapter's name as header and the corresponding sequence::
>TruSeq_Universal_Adapter
AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
>TruSeq_Adapter_Index 1
GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
Parameters
----------
adapter_fasta : str
Path to Fasta file with adapter sequences.
Returns
-------
adapter_out : str or None
The path to the reformatted adapter file. Returns ``None`` if the
adapters file does not exist or the path is incorrect.
"""
adapter_out = "fastqc_adapters.tab"
logger.debug("Setting output adapters file to: {}".format(adapter_out))
try:
with open(adapter_fasta) as fh, \
open(adapter_out, "w") as adap_fh:
for line in fh:
if line.startswith(">"):
head = line[1:].strip()
# Get the next line with the sequence string
sequence = next(fh).strip()
adap_fh.write("{}\\t{}\\n".format(head, sequence))
logger.info("Converted adapters file")
return adapter_out
# If an invalid adapters file is provided, return None.
except FileNotFoundError:
logger.warning("Could not find the provided adapters file: {}".format(
adapter_fasta))
return
@MainWrapper
def main(fastq_pair, adapter_file, cpus):
""" Main executor of the fastq template.
Parameters
----------
fastq_pair : list
Two element list containing the paired FastQ files.
adapter_file : str
Path to adapters file.
cpus : int or str
Number of cpu's that will be by FastQC.
"""
logger.info("Starting fastqc")
# If an adapter file was provided, convert it to FastQC format
if os.path.exists(adapter_file):
logger.info("Adapters file provided: {}".format(adapter_file))
adapters = convert_adatpers(adapter_file)
else:
logger.info("Adapters file '{}' not provided or does not "
"exist".format(adapter_file))
adapters = None
# Setting command line for FastQC
cli = [
"fastqc",
"--extract",
"--nogroup",
"--format",
"fastq",
"--threads",
str(cpus)
]
# Add adapters file to command line, if it exists
if adapters:
cli += ["--adapters", "{}".format(adapters)]
# Add FastQ files at the end of command line
cli += fastq_pair
logger.debug("Running fastqc subprocess with command: {}".format(cli))
p = subprocess.Popen(cli, stdout=PIPE, stderr=PIPE, shell=False)
stdout, stderr = p.communicate()
# Attempt to decode STDERR output from bytes. If unsuccessful, coerce to
# string
try:
stderr = stderr.decode("utf8")
except (UnicodeDecodeError, AttributeError):
stderr = str(stderr)
logger.info("Finished fastqc subprocess with STDOUT:\\n"
"======================================\\n{}".format(stdout))
logger.info("Fished fastqc subprocesswith STDERR:\\n"
"======================================\\n{}".format(stderr))
logger.info("Finished fastqc with return code: {}".format(
p.returncode))
logger.info("Checking if FastQC output was correctly generated")
# Check if the FastQC output was correctly generated.
with open(".status", "w") as status_fh:
for fastq in fastq_pair:
fpath = join(fastq.rsplit(".", 2)[0] + "_fastqc",
"fastqc_data.txt")
logger.debug("Checking path: {}".format(fpath))
# If the FastQC output does not exist, pass the STDERR to
# the output status channel and exit
if not exists(fpath):
logger.warning("Path does not exist: {}".format(fpath))
status_fh.write("fail")
return
logger.debug("Found path: {}".format(fpath))
# If the output directories exist, write 'pass' to the output status
# channel
status_fh.write("pass")
logger.info("Retrieving relevant FastQC output files")
# Both FastQC have been correctly executed. Get the relevant FastQC
# output files for the output channel
for i, fastq in enumerate(fastq_pair):
# Get results for each pair
fastqc_dir = fastq.rsplit(".", 2)[0] + "_fastqc"
summary_file = join(fastqc_dir, "summary.txt")
logger.debug("Retrieving summary file: {}".format(summary_file))
fastqc_data_file = join(fastqc_dir, "fastqc_data.txt")
logger.debug("Retrieving data file: {}".format(fastqc_data_file))
# Rename output files to a file name that is easier to handle in the
# output channel
os.rename(fastqc_data_file, "pair_{}_data".format(i + 1))
os.rename(summary_file, "pair_{}_summary".format(i + 1))
if __name__ == "__main__":
main(FASTQ_PAIR, ADAPTER_FILE, CPUS)