import pysam
import concurrent.futures
import argparse
import time
start_time = time.time()
def remap_genotype(gt, ref, alt):
# Remap the genotype based on swapped alleles.
if isinstance(gt, tuple): # If gt is a tuple, convert to string
gt = "/".join(str(x) for x in gt)
gt_fields = gt.split("/")
remapped_fields = []
#Basically takes the number and then it appends the actual letter to the list called remapped_fields
for field in gt_fields:
if field == "0":
remapped_fields.append(ref)
elif field == "1":
remapped_fields.append(alt)
else:
remapped_fields.append(".")
return "/".join(remapped_fields)
def compare_genotypes(record1, record2, allele_map):
differences = 0
if record1.pos == record2.pos and record1.chrom == record2.chrom:
ref1, alt1 = allele_map.get(record1.id, (record1.ref, record1.alts[0])) # Get record 1's reference and record 1's alternate alleles via their ID
ref2, alt2 = allele_map.get(record2.id, (record2.ref, record2.alts[0])) # Same thing as above but you basically just get it for record 2 nothing too complciated, could possibly made it faster with like use of tabix or something but yeah
samples1 = list(record1.samples.values()) # Get all samples in file 1
samples2 = list(record2.samples.values()) # Get all samples in file 2
for i in range(len(samples1)):
sample1 = samples1[i]
sample2 = samples2[i]
gt1 = sample1["GT"]
gt2 = sample2["GT"]
# Remap genotypes if references don't match or if the alterante don't match up
if ref1 != ref2 or alt1 != alt2:
gt1 = remap_genotype(gt1, ref1, alt1)
gt2 = remap_genotype(gt2, ref2, alt2)
# Compare the remapped genotypes via the remapped genotypes list which is now represented in G1, G2
if gt1 != gt2:
differences += 1
print(f"Position: {record1.chrom}:{record1.pos}")
print(f"Sample {i}:")
print(f"GT1: {gt1}, GT2: {gt2}")
print(f"Ref1: {ref1}, Alt1: {alt1}")
print(f"Ref2: {ref2}, Alt2: {alt2}")
print("-" * 40)
return differences
def compare_vcf_files(vcf_file1, vcf_file2, dev_mode=False): #If nothing's passed then no dev mode and it'll just pass out the thing
differences = 0
allele_map = {} # A dictionary to store ref and alt alleles for each record ID
with pysam.VariantFile(vcf_file1) as vcf1, pysam.VariantFile(vcf_file2) as vcf2:
for record in vcf1:
allele_map[record.id] = (record.ref, record.alts[0]) # Assuming single alternate allele
# Use concurrent processing to compare genotypes
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = []
for record1 in vcf1.fetch():
for record2 in vcf2.fetch(record1.chrom, record1.pos, record1.pos + 1):
future = executor.submit(compare_genotypes, record1, record2, allele_map)
futures.append(future)
for future in concurrent.futures.as_completed(futures):
differences += future.result()
return differences
'''
Usage case:
python compare.py vcf_file_1 vcf_file_2 --dev_mode
Basically dev mode gives u a nice prints so you can see what it thinks/counts as a mutation
Right now it does not consider the indel mutations (if for example ref1 does not match up with ref2, but ref2 is longer it will still count it as a mutation for now) but that should be an easy check with the compare_genotypes for length
But for now since things have been kinda busy I'm just gonna send what I have right now, since we have to deal with indel mutations later anyway
'''
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compare two VCF files and find genotype differences.")
parser.add_argument("vcf_file1", type=str, help="Path to the first VCF file")
parser.add_argument("vcf_file2", type=str, help="Path to the second VCF file")
parser.add_argument("--dev_mode", action="store_true", help="Enable developer mode with print statements")
args = parser.parse_args()
differences = compare_vcf_files(args.vcf_file1, args.vcf_file2, args.dev_mode)
# Time execution (some rando thing I copied from stack overflow to print the time):
end_time = time.time()
total_time = end_time - start_time
minutes = int(total_time // 60)
seconds = total_time % 60
print("Number of genotype differences:", differences)
print(f"Execution time: {minutes} minutes and {seconds:.2f} seconds")
# Before it was commandline parsed
# vcf_file1 = "/home/will/Downloads/mock_data_vcf_1/NDAR-INV119XNUX_S2_merged_L001_markdup_recalibrated_Haplotyper_MOCKDATA.vcf.gz"
# vcf_file2 = "/home/will/Downloads/mock_data_vcf_1/NDAR-INV1GBYJF8_L001_markdup_recalibrated_Haplotyper_MOCKDATA.vcf.gz"
# differences = compare_vcf_files(vcf_file1, vcf_file2)
# print("Number of genotype differences:", differences)