vcf_master = sys.argv[1]#the vcf file to compare the other vcf file to
vcf_compare = sys.argv[2]#the vcf file to compare
sample_name = sys.argv[3] #the sample to allocate
master = vcf.VCFReader(filename=vcf_master)
compare = vcf.VCFReader(filename=vcf_compare)
#create a dictionary which will use tuples as keys, e.g. vcf_master_dict[chr01][999] = GT
#loop thru each record in the master vcf
vcf_master_dict[(record.CHROM, record.POS)] = alleles
#loop thru each record in the compare vcf
if (record.CHROM, record.POS) in vcf_master_dict:
for sample in record.samples:
#check to confirm that the sample name (sys.argv[3]) is present in our compare vcf
if sample.sample == sample_name:
current_genotype = sample["GT"]
if current_genotype == '0/1' or current_genotype == '0|1':
if (vcf_master_dict[(record.CHROM, record.POS)] == record.alleles):
elif current_genotype == '0/0' or current_genotype == '0|0':
alleles = vcf_master_dict[(record.CHROM, record.POS)]
elif current_genotype == '1/1' or current_genotype == '1|1':
if (vcf_master_dict[(record.CHROM, record.POS)] == record.alleles):
elif current_genotype == './.' or current_genotype == '.|.':
sys.stderr.write("\nThe following genotype was not found in the master vcf: ")
sys.stderr.write(str(record))
sys.stderr.write(str(current_genotype))
sys.stderr.write(str(vcf_master_dict[record.CHROM, record.POS]))
sys.stderr.write("\nThe following record was not found in the master vcf: ")
sys.stderr.write(str(record))
#n_combined = n_matching_het + n_hom_ref + n_hom_alt
part_het = float(n_matching_het)/float(n_compare)
part_non_matching_het = float(n_non_matching_het)/float(n_compare)
part_hom_ref = float(n_hom_ref)/float(n_compare)
part_hom_alt = float(n_hom_alt)/float(n_compare)
part_not_called=float(n_not_called)/float(n_compare)
part_not_accouted=float(n_not_accouted)/float(n_compare)
part_not_in_compare=float(n_not_in_compare)/float(n_compare)
#How many records were missing from the compare vcf (which were present in the master vcf)
n_master_minus_n_compare=n_master - n_compare
part_not_found=float(n_master_minus_n_compare)/float(n_master)
count_num_compare=n_matching_het+n_non_matching_het+n_hom_ref+n_hom_alt+n_not_called+n_not_accouted
check_num_compare=float(count_num_compare)/float(n_compare)
check_parts_compare=part_non_matching_het+part_het+part_hom_ref+part_hom_alt+part_not_called+part_not_accouted+part_not_in_compare
sys.stderr.write("Number of sites in master "+ str(n_master)+ "\n")
sys.stderr.write("Number of sites in compare "+ str(n_compare)+ "\n")
sys.stderr.write("No. missing (not-called) sites: "+ str(n_master_minus_n_compare)+ "\n")
sys.stderr.write("Number of sites allocated a category: " + str(count_num_compare)+ "\n")
#print("Number + part of non-matching heterozygous sites " + str(n_non_matching_het) + "(" + str(part_non_matching_het) + ")" )
#print("Number + part of matching heterozygous sites " + str(n_matching_het) + "(" + str(part_het) + ")" )
#print("Number + part of matching homozygous ref sites " + str(n_hom_ref) + "(" + str(part_hom_ref) + ")" )
#print("Number + part of matching homozygous alt sites " + str(n_hom_alt) + "(" + str(part_hom_alt) + ")" )
# print("Number + part of sites not called " + str(n_not_called) + "(" + str(part_not_called) + ")" )
# print("Number + part of genotypes not accounted for " + str(n_not_accouted) + "(" + str(part_not_accouted) + ")" )
# print("Number + part of sites not found " + str(n_not_in_compare) + "(" + str(part_not_in_compare) + ")" )
sys.stderr.write("Checking that number of called genotypes add up to number of records. This number should be one: " + str(check_num_compare)+ "\n" )
sys.stderr.write("Checking that parts of called genotypes add up to number of records. This number should be very close to one: " + str(float(check_parts_compare))+ "\n")
print("sample,n_not_called,part_not_called,n_non_matching_het,part_non_matching_het,n_matching_het,part_het,n_hom_ref,part_hom_ref,n_hom_alt,part_hom_alt")
print(sample_name+","+str(n_not_called)+","+str(part_not_called)+","+str(n_non_matching_het)+","+str(part_non_matching_het)+","+str(n_matching_het)+","+str(part_het)+","+str(n_hom_ref)+","+str(part_hom_ref)+","+str(n_hom_alt)+","+str(part_hom_alt))