BLASTP to MCL Clustering Pipeline

Developed by: @SEL-nivedi

Python Bioinformatics Genomics
blastp_mcl_pipeline.py
#!/nivedi/bin/env python3
"""
BLASTP to MCL Clustering Pipeline (Starting from existing GFF files)
-----------------------------------------------------------------
Developed by: @SEL-nivedi
Version: 1.0
License: MIT
-----------------------------------------------------------------

Purpose:
This script automates the process of ortholog/paralog identification by:
1. Extracting protein sequences from existing GFF files
2. Running BLASTP for sequence similarity comparisons
3. Converting BLAST results to MCL-compatible input format
4. Running MCL (Markov Cluster Algorithm) for protein clustering

Requirements:
- Python 3.6+
- BioPython
- NCBI BLAST+ tools (blastp, makeblastdb)
- MCL (Markov Cluster Algorithm)

Usage:
python blastp_mcl_pipeline.py --gff_dir directory_with_gff_files/ --output_dir results/ --inflation 1.5
"""
import os
import sys
import argparse
import subprocess
import shutil
import logging
from pathlib import Path
from Bio import SeqIO

# Configure logging to track progress and errors during execution
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger()

def parse_arguments():
    """
    Parse command-line arguments passed to the script.
    
    Returns:
        argparse.Namespace: Object containing all the arguments and their values
    """
    parser = argparse.ArgumentParser(description="BLASTP to MCL Clustering Pipeline")
    # Required argument for directory containing GFF files
    parser.add_argument("--gff_dir", required=True, help="Directory containing GFF files")
    # Optional arguments with default values
    parser.add_argument("--output_dir", default="results", help="Output directory for all results")
    parser.add_argument("--inflation", type=float, default=1.5, 
                        help="MCL inflation parameter (controls cluster granularity)")
    parser.add_argument("--evalue", type=float, default=1e-5, 
                        help="BLAST E-value threshold for hit significance")
    parser.add_argument("--threads", type=int, default=4, 
                        help="Number of CPU threads for BLASTP parallelization")
    return parser.parse_args()

def check_dependencies():
    """
    Check if required external tools are installed and available in the system PATH.
    
    Exits the program if any of the required tools are missing.
    """
    # List of external tools required by this pipeline
    tools = ["blastp", "makeblastdb", "mcl"]
    missing = []
    
    # Check each tool using shutil.which (returns None if not found)
    for tool in tools:
        if shutil.which(tool) is None:
            missing.append(tool)
    
    # If any tools are missing, exit with an error
    if missing:
        logger.error(f"Missing required tools: {', '.join(missing)}. Please install them and try again.")
        sys.exit(1)
    
    logger.info("All required tools found.")

def extract_proteins_from_gff(gff_dir, output_dir):
    """
    Extract protein sequences from GFF files or their corresponding .faa files.
    
    Args:
        gff_dir (str): Directory containing GFF files and possibly .faa files
        output_dir (str): Directory to store output files
    
    Returns:
        str: Path to the combined protein FASTA file
        
    Note: This implementation assumes .faa files are available alongside GFF files.
          Additional code would be needed to extract proteins directly from GFF.
    """
    logger.info(f"Extracting protein sequences from GFF files in {gff_dir}")
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Find all GFF files in the specified directory
    gff_files = [f for f in os.listdir(gff_dir) if f.endswith('.gff') or f.endswith('.gff3')]
    
    # Check if any GFF files were found
    if not gff_files:
        logger.error(f"No GFF files found in {gff_dir}")
        sys.exit(1)
    
    protein_files = []
    combined_fasta = os.path.join(output_dir, "all_proteins.faa")
    
    # Create a single combined protein FASTA file from all sources
    with open(combined_fasta, 'w') as combined_out:
        for gff_file in gff_files:
            gff_path = os.path.join(gff_dir, gff_file)
            base_name = os.path.splitext(gff_file)[0]
            
            # Look for a corresponding .faa file (protein sequences)
            faa_file = os.path.join(gff_dir, f"{base_name}.faa")
            
            if os.path.exists(faa_file):
                logger.info(f"Found protein FASTA file for {gff_file}: {faa_file}")
                
                # Append protein sequences to combined file with modified headers
                # to prevent ID conflicts between different genomes/species
                with open(faa_file) as faa_in:
                    for line in faa_in:
                        if line.startswith('>'):
                            # Prefix protein IDs with the source file name to avoid ID conflicts
                            header = line.strip().replace('>', f'>{base_name}_')
                            combined_out.write(f"{header}\n")
                        else:
                            combined_out.write(line)
                
                protein_files.append(faa_file)
            else:
                logger.warning(f"No corresponding .faa file found for {gff_file}")
                # TODO: Implement extraction of protein sequences directly from GFF
                # This would require parsing the GFF and extracting/translating CDS features
    
    # Check if any protein files were found and processed
    if not protein_files:
        logger.error("No protein sequences extracted. Exiting.")
        sys.exit(1)
    
    logger.info(f"Combined protein sequences into {combined_fasta}")
    return combined_fasta

def create_blast_database(fasta_file, output_dir):
    """
    Create a BLAST protein database from the combined FASTA file.
    
    Args:
        fasta_file (str): Path to the combined protein FASTA file
        output_dir (str): Directory to store the BLAST database files
    
    Returns:
        str: Path to the created BLAST database
        
    Note: Uses NCBI's makeblastdb tool to create a protein database
    """
    logger.info("Creating BLAST database")
    
    # Define the database name/path
    db_path = os.path.join(output_dir, "blastdb")
    
    # Command to create a protein BLAST database
    cmd = f"makeblastdb -in {fasta_file} -dbtype prot -out {db_path}"
    
    try:
        # Run the command and check for errors
        subprocess.run(cmd, shell=True, check=True)
        logger.info(f"BLAST database created at {db_path}")
        return db_path
    except subprocess.CalledProcessError as e:
        logger.error(f"Error creating BLAST database: {e}")
        sys.exit(1)

def run_blastp(query_fasta, db_path, output_dir, evalue, threads):
    """
    Run BLASTP to compare all protein sequences against each other.
    
    Args:
        query_fasta (str): Path to the query protein FASTA file
        db_path (str): Path to the BLAST database
        output_dir (str): Directory to store BLAST results
        evalue (float): E-value threshold for reporting hits
        threads (int): Number of CPU threads to use
    
    Returns:
        str: Path to the BLAST results file
        
    Note: The output format is tabular (outfmt 6) with 12 standard fields
    """
    logger.info("Running BLASTP comparison")
    
    # Define the BLAST output file
    blast_output = os.path.join(output_dir, "blastp_results.tsv")
    
    # Construct the BLASTP command with all parameters
    # -outfmt 6 produces tab-separated values with 12 standard fields
    cmd = (
        f"blastp -query {query_fasta} -db {db_path} -out {blast_output} "
        f"-outfmt '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore' "
        f"-evalue {evalue} -num_threads {threads}"
    )
    
    try:
        # Run BLASTP and check for errors
        subprocess.run(cmd, shell=True, check=True)
        logger.info(f"BLASTP completed, results saved to {blast_output}")
        return blast_output
    except subprocess.CalledProcessError as e:
        logger.error(f"Error running BLASTP: {e}")
        sys.exit(1)

def convert_blast_to_abc(blast_output, output_dir):
    """
    Convert BLAST results to MCL input format (ABC format).
    
    The ABC format is a simple three-column format:
    source_node target_node edge_weight
    
    Args:
        blast_output (str): Path to the BLAST results file
        output_dir (str): Directory to store the converted file
    
    Returns:
        str: Path to the MCL input file in ABC format
        
    Note: Uses bit score as edge weight, and filters out self-hits
    """
    logger.info("Converting BLAST output to MCL input format")
    
    # Define the output ABC file for MCL
    abc_file = os.path.join(output_dir, "mcl_input.abc")
    
    try:
        # Process BLAST results line by line
        with open(blast_output) as infile, open(abc_file, 'w') as outfile:
            for line in infile:
                fields = line.strip().split('\t')
                
                # Skip malformed lines
                if len(fields) < 12:
                    continue
                
                query = fields[0]  # Query protein ID
                subject = fields[1]  # Subject protein ID
                
                # Skip self-hits (protein matched to itself)
                if query == subject:
                    continue
                
                # Use bit score as similarity measure (higher is better)
                # Could alternatively use -log10(E-value)
                bitscore = fields[11]
                
                # Write to ABC format: query subject weight
                outfile.write(f"{query}\t{subject}\t{bitscore}\n")
        
        logger.info(f"MCL input file created: {abc_file}")
        return abc_file
    except Exception as e:
        logger.error(f"Error converting BLAST output to ABC format: {e}")
        sys.exit(1)

def run_mcl(abc_file, inflation, output_dir):
    """
    Run the Markov Cluster Algorithm (MCL) to identify protein clusters.
    
    Args:
        abc_file (str): Path to the MCL input file in ABC format
        inflation (float): MCL inflation parameter (controls cluster granularity)
        output_dir (str): Directory to store MCL results
    
    Returns:
        str: Path to the MCL clusters output file
        
    Note: The inflation parameter controls cluster granularity:
          - Lower values (e.g., 1.2) produce fewer, larger clusters
          - Higher values (e.g., 5.0) produce more, smaller clusters
    """
    logger.info(f"Running MCL with inflation parameter {inflation}")
    
    # Define the output clusters file
    clusters_file = os.path.join(output_dir, "clusters.txt")
    
    # Construct MCL command
    # --abc flag indicates the input is in ABC format
    cmd = f"mcl {abc_file} --abc -I {inflation} -o {clusters_file}"
    
    try:
        # Run MCL and check for errors
        subprocess.run(cmd, shell=True, check=True)
        logger.info(f"MCL clustering completed. Results saved to {clusters_file}")
        return clusters_file
    except subprocess.CalledProcessError as e:
        logger.error(f"Error running MCL: {e}")
        sys.exit(1)

def main():
    """
    Main function to orchestrate the entire pipeline workflow.
    
    Pipeline steps:
    1. Parse arguments and check dependencies
    2. Extract protein sequences from GFF/FAA files
    3. Create BLAST database
    4. Run BLASTP
    5. Convert BLAST results to MCL input
    6. Run MCL clustering
    7. Report results
    """
    # Parse command-line arguments
    args = parse_arguments()
    
    # Check if all required external tools are available
    check_dependencies()
    
    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)
    
    # Step 1: Extract proteins from GFF files (or associated FAA files)
    logger.info("Step 1: Extracting protein sequences from GFF files")
    combined_fasta = extract_proteins_from_gff(args.gff_dir, args.output_dir)
    
    # Step 2: Create BLAST database from the combined protein sequences
    logger.info("Step 2: Creating BLAST database")
    db_path = create_blast_database(combined_fasta, args.output_dir)
    
    # Step 3: Run BLASTP to identify sequence similarities
    logger.info("Step 3: Running BLASTP")
    blast_output = run_blastp(combined_fasta, db_path, args.output_dir, args.evalue, args.threads)
    
    # Step 4: Convert BLAST output to MCL input format (ABC)
    logger.info("Step 4: Converting BLAST output to MCL input format")
    abc_file = convert_blast_to_abc(blast_output, args.output_dir)
    
    # Step 5: Run MCL to identify protein clusters
    logger.info("Step 5: Running MCL algorithm")
    clusters_file = run_mcl(abc_file, args.inflation, args.output_dir)
    
    # Report completion and provide summary statistics
    logger.info("Pipeline completed successfully!")
    logger.info(f"Results are available in {args.output_dir}/clusters.txt")
    
    # Print basic statistics about the clusters
    with open(clusters_file) as f:
        clusters = [line.strip().split('\t') for line in f]
    
    logger.info(f"Total number of clusters: {len(clusters)}")
    logger.info(f"Largest cluster size: {max(len(cluster) for cluster in clusters)} proteins")

if __name__ == "__main__":
    main()

Features

  • Automated protein sequence extraction from GFF files
  • BLASTP sequence similarity analysis
  • MCL clustering for protein family identification
  • Configurable parameters (e-value, inflation)
  • Detailed logging and error handling

Requirements

  • Python 3.6+
  • BioPython package
  • NCBI BLAST+ tools
  • MCL (Markov Cluster Algorithm)
  • GFF/FASTA formatted input files

Usage

python blastp_mcl_pipeline.py
--gff_dir /path/to/gff/
--output_dir results/
--inflation 1.5
--evalue 1e-5
--threads 4