BLASTP to MCL Clustering Pipeline
Developed by: @SEL-nivedi
Tags: Python, Bioinformatics, Genomics
blastp_mcl_pipeline.py
#!/usr/bin/env python3
"""
BLASTP to MCL Clustering Pipeline (Starting from existing GFF files)
-----------------------------------------------------------------
Developed by: @SEL-nivedi
Version: 1.0
License: MIT
-----------------------------------------------------------------
Purpose:
This script automates the process of ortholog/paralog identification by:
1. Extracting protein sequences from existing GFF files
2. Running BLASTP for sequence similarity comparisons
3. Converting BLAST results to MCL-compatible input format
4. Running MCL (Markov Cluster Algorithm) for protein clustering
Requirements:
- Python 3.6+
- BioPython
- NCBI BLAST+ tools (blastp, makeblastdb)
- MCL (Markov Cluster Algorithm)
Usage:
python blastp_mcl_pipeline.py --gff_dir directory_with_gff_files/ --output_dir results/ --inflation 1.5
"""
import os
import sys
import argparse
import subprocess
import shutil
import logging
from pathlib import Path
from Bio import SeqIO
# Configure logging to track progress and errors during execution
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger()
def parse_arguments():
"""
Parse command-line arguments passed to the script.
Returns:
argparse.Namespace: Object containing all the arguments and their values
"""
parser = argparse.ArgumentParser(description="BLASTP to MCL Clustering Pipeline")
# Required argument for directory containing GFF files
parser.add_argument("--gff_dir", required=True, help="Directory containing GFF files")
# Optional arguments with default values
parser.add_argument("--output_dir", default="results", help="Output directory for all results")
parser.add_argument("--inflation", type=float, default=1.5,
help="MCL inflation parameter (controls cluster granularity)")
parser.add_argument("--evalue", type=float, default=1e-5,
help="BLAST E-value threshold for hit significance")
parser.add_argument("--threads", type=int, default=4,
help="Number of CPU threads for BLASTP parallelization")
return parser.parse_args()
def check_dependencies():
"""
Check if required external tools are installed and available in the system PATH.
Exits the program if any of the required tools are missing.
"""
# List of external tools required by this pipeline
tools = ["blastp", "makeblastdb", "mcl"]
missing = []
# Check each tool using shutil.which (returns None if not found)
for tool in tools:
if shutil.which(tool) is None:
missing.append(tool)
# If any tools are missing, exit with an error
if missing:
logger.error(f"Missing required tools: {', '.join(missing)}. Please install them and try again.")
sys.exit(1)
logger.info("All required tools found.")
def extract_proteins_from_gff(gff_dir, output_dir):
"""
Extract protein sequences from GFF files or their corresponding .faa files.
Args:
gff_dir (str): Directory containing GFF files and possibly .faa files
output_dir (str): Directory to store output files
Returns:
str: Path to the combined protein FASTA file
Note: This implementation assumes .faa files are available alongside GFF files.
Additional code would be needed to extract proteins directly from GFF.
"""
logger.info(f"Extracting protein sequences from GFF files in {gff_dir}")
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Find all GFF files in the specified directory
gff_files = [f for f in os.listdir(gff_dir) if f.endswith('.gff') or f.endswith('.gff3')]
# Check if any GFF files were found
if not gff_files:
logger.error(f"No GFF files found in {gff_dir}")
sys.exit(1)
protein_files = []
combined_fasta = os.path.join(output_dir, "all_proteins.faa")
# Create a single combined protein FASTA file from all sources
with open(combined_fasta, 'w') as combined_out:
for gff_file in gff_files:
gff_path = os.path.join(gff_dir, gff_file)
base_name = os.path.splitext(gff_file)[0]
# Look for a corresponding .faa file (protein sequences)
faa_file = os.path.join(gff_dir, f"{base_name}.faa")
if os.path.exists(faa_file):
logger.info(f"Found protein FASTA file for {gff_file}: {faa_file}")
# Append protein sequences to combined file with modified headers
# to prevent ID conflicts between different genomes/species
with open(faa_file) as faa_in:
for line in faa_in:
if line.startswith('>'):
# Prefix protein IDs with the source file name to avoid ID conflicts
header = f">{base_name}_{line[1:].strip()}"
combined_out.write(f"{header}\n")
else:
combined_out.write(line)
protein_files.append(faa_file)
else:
logger.warning(f"No corresponding .faa file found for {gff_file}")
# TODO: Implement extraction of protein sequences directly from GFF
# This would require parsing the GFF and extracting/translating CDS features
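                # A minimal, untested sketch of that extraction (assumes a genome
                # FASTA named <base_name>.fna next to the GFF, and single-exon
                # CDS features) could use the Bio.SeqIO import above:
                #     genome = SeqIO.to_dict(SeqIO.parse(
                #         os.path.join(gff_dir, f"{base_name}.fna"), "fasta"))
                #     for seqid, start, end, strand in cds_records:  # hypothetical, parsed from the GFF
                #         cds = genome[seqid].seq[start - 1:end]
                #         if strand == '-':
                #             cds = cds.reverse_complement()
                #         protein = cds.translate(to_stop=True)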
# Check if any protein files were found and processed
if not protein_files:
logger.error("No protein sequences extracted. Exiting.")
sys.exit(1)
logger.info(f"Combined protein sequences into {combined_fasta}")
return combined_fasta
def create_blast_database(fasta_file, output_dir):
"""
Create a BLAST protein database from the combined FASTA file.
Args:
fasta_file (str): Path to the combined protein FASTA file
output_dir (str): Directory to store the BLAST database files
Returns:
str: Path to the created BLAST database
Note: Uses NCBI's makeblastdb tool to create a protein database
"""
logger.info("Creating BLAST database")
# Define the database name/path
db_path = os.path.join(output_dir, "blastdb")
    # Build the makeblastdb command as an argument list; this avoids
    # shell-quoting problems with paths that contain spaces
    cmd = ["makeblastdb", "-in", fasta_file, "-dbtype", "prot", "-out", db_path]
    try:
        # Run the command and check for errors
        subprocess.run(cmd, check=True)
logger.info(f"BLAST database created at {db_path}")
return db_path
except subprocess.CalledProcessError as e:
logger.error(f"Error creating BLAST database: {e}")
sys.exit(1)
def run_blastp(query_fasta, db_path, output_dir, evalue, threads):
"""
Run BLASTP to compare all protein sequences against each other.
Args:
query_fasta (str): Path to the query protein FASTA file
db_path (str): Path to the BLAST database
output_dir (str): Directory to store BLAST results
evalue (float): E-value threshold for reporting hits
threads (int): Number of CPU threads to use
Returns:
str: Path to the BLAST results file
Note: The output format is tabular (outfmt 6) with 12 standard fields
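    An outfmt-6 line then looks like this (hypothetical values):
        genomeA_p1  genomeB_p9  87.50  200  25  0  1  200  1  200  1e-80  250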
"""
logger.info("Running BLASTP comparison")
# Define the BLAST output file
blast_output = os.path.join(output_dir, "blastp_results.tsv")
# Construct the BLASTP command with all parameters
# -outfmt 6 produces tab-separated values with 12 standard fields
    cmd = [
        "blastp", "-query", query_fasta, "-db", db_path, "-out", blast_output,
        "-outfmt", "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore",
        "-evalue", str(evalue), "-num_threads", str(threads),
    ]
    try:
        # Run BLASTP and check for errors
        subprocess.run(cmd, check=True)
logger.info(f"BLASTP completed, results saved to {blast_output}")
return blast_output
except subprocess.CalledProcessError as e:
logger.error(f"Error running BLASTP: {e}")
sys.exit(1)
def convert_blast_to_abc(blast_output, output_dir):
"""
Convert BLAST results to MCL input format (ABC format).
The ABC format is a simple three-column format:
source_node target_node edge_weight
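    For example, one line (with hypothetical protein IDs) would read:
        genomeA_prot1    genomeB_prot7    153.0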
Args:
blast_output (str): Path to the BLAST results file
output_dir (str): Directory to store the converted file
Returns:
str: Path to the MCL input file in ABC format
Note: Uses bit score as edge weight, and filters out self-hits
"""
logger.info("Converting BLAST output to MCL input format")
# Define the output ABC file for MCL
abc_file = os.path.join(output_dir, "mcl_input.abc")
try:
# Process BLAST results line by line
with open(blast_output) as infile, open(abc_file, 'w') as outfile:
for line in infile:
fields = line.strip().split('\t')
# Skip malformed lines
if len(fields) < 12:
continue
query = fields[0] # Query protein ID
subject = fields[1] # Subject protein ID
# Skip self-hits (protein matched to itself)
if query == subject:
continue
# Use bit score as similarity measure (higher is better)
# Could alternatively use -log10(E-value)
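                # A hedged sketch of that alternative (it needs `import math`
                # and a guard for E-values reported as exactly 0):
                #     weight = -math.log10(float(fields[10]))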
bitscore = fields[11]
# Write to ABC format: query subject weight
outfile.write(f"{query}\t{subject}\t{bitscore}\n")
logger.info(f"MCL input file created: {abc_file}")
return abc_file
except Exception as e:
logger.error(f"Error converting BLAST output to ABC format: {e}")
sys.exit(1)
def run_mcl(abc_file, inflation, output_dir):
"""
Run the Markov Cluster Algorithm (MCL) to identify protein clusters.
Args:
abc_file (str): Path to the MCL input file in ABC format
inflation (float): MCL inflation parameter (controls cluster granularity)
output_dir (str): Directory to store MCL results
Returns:
str: Path to the MCL clusters output file
Note: The inflation parameter controls cluster granularity:
- Lower values (e.g., 1.2) produce fewer, larger clusters
- Higher values (e.g., 5.0) produce more, smaller clusters
"""
logger.info(f"Running MCL with inflation parameter {inflation}")
# Define the output clusters file
clusters_file = os.path.join(output_dir, "clusters.txt")
# Construct MCL command
# --abc flag indicates the input is in ABC format
    cmd = ["mcl", abc_file, "--abc", "-I", str(inflation), "-o", clusters_file]
    try:
        # Run MCL and check for errors
        subprocess.run(cmd, check=True)
logger.info(f"MCL clustering completed. Results saved to {clusters_file}")
return clusters_file
except subprocess.CalledProcessError as e:
logger.error(f"Error running MCL: {e}")
sys.exit(1)
def main():
"""
Main function to orchestrate the entire pipeline workflow.
Pipeline steps:
1. Parse arguments and check dependencies
2. Extract protein sequences from GFF/FAA files
3. Create BLAST database
4. Run BLASTP
5. Convert BLAST results to MCL input
6. Run MCL clustering
7. Report results
"""
# Parse command-line arguments
args = parse_arguments()
# Check if all required external tools are available
check_dependencies()
# Create output directory
os.makedirs(args.output_dir, exist_ok=True)
# Step 1: Extract proteins from GFF files (or associated FAA files)
logger.info("Step 1: Extracting protein sequences from GFF files")
combined_fasta = extract_proteins_from_gff(args.gff_dir, args.output_dir)
# Step 2: Create BLAST database from the combined protein sequences
logger.info("Step 2: Creating BLAST database")
db_path = create_blast_database(combined_fasta, args.output_dir)
# Step 3: Run BLASTP to identify sequence similarities
logger.info("Step 3: Running BLASTP")
blast_output = run_blastp(combined_fasta, db_path, args.output_dir, args.evalue, args.threads)
# Step 4: Convert BLAST output to MCL input format (ABC)
logger.info("Step 4: Converting BLAST output to MCL input format")
abc_file = convert_blast_to_abc(blast_output, args.output_dir)
# Step 5: Run MCL to identify protein clusters
logger.info("Step 5: Running MCL algorithm")
clusters_file = run_mcl(abc_file, args.inflation, args.output_dir)
# Report completion and provide summary statistics
logger.info("Pipeline completed successfully!")
logger.info(f"Results are available in {args.output_dir}/clusters.txt")
# Print basic statistics about the clusters
    with open(clusters_file) as f:
        clusters = [line.strip().split('\t') for line in f if line.strip()]
    logger.info(f"Total number of clusters: {len(clusters)}")
    if clusters:
        logger.info(f"Largest cluster size: {max(len(cluster) for cluster in clusters)} proteins")
if __name__ == "__main__":
main()
Features
- Automated protein sequence extraction from GFF files
- BLASTP sequence similarity analysis
- MCL clustering for protein family identification
- Configurable parameters (e-value, inflation)
- Detailed logging and error handling
Requirements
- Python 3.6+
- BioPython package
- NCBI BLAST+ tools
- MCL (Markov Cluster Algorithm)
- GFF/FASTA formatted input files
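One way to set these up, assuming a conda environment with the bioconda channel enabled (an untested sketch; package names can vary by system):
conda install -c bioconda blast mcl
pip install biopython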
Usage
python blastp_mcl_pipeline.py \
    --gff_dir /path/to/gff/ \
    --output_dir results/ \
    --inflation 1.5 \
    --evalue 1e-5 \
    --threads 4
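Each line of clusters.txt is one cluster, with its member protein IDs separated by tabs. A minimal sketch of a hypothetical helper for summarising cluster sizes after a run (assumes the default results/ output directory):
summarise_clusters.py
from collections import Counter

# Read clusters.txt: one cluster per line, tab-separated protein IDs
with open("results/clusters.txt") as f:
    clusters = [line.rstrip("\n").split("\t") for line in f if line.strip()]

# Tally how many clusters exist at each size
sizes = Counter(len(cluster) for cluster in clusters)
print(f"{len(clusters)} clusters in total")
for size, count in sorted(sizes.items(), reverse=True):
    print(f"  size {size}: {count} cluster(s)")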