#Count UMis and reads per target sequences in alignment file (sam)
parser = argparse.ArgumentParser(description='')
parser.add_argument("-A", "--alignment", dest="aln_file" , type=str, help="QC paired end alignement file")
parser.add_argument("-O", "--output", dest="output_path" , type=str, help="output path")
parser.add_argument("-S", "--sample", dest="sample_name" , type=str, help="sample name")
args = parser.parse_args()
output_path = args.output_path
sample_name = args.sample_name
ref2umi = {} #keys : target sequences ; values : list of UMIs matching sequences
ref2reads = {} #keys : target sequences ; values : list of reads matching sequences
with open(aln_file, 'r') as leSam:
read_mapped = line.split('\t')[0]
corresponding_umi = read_mapped.split('_')[1]
reference_mapped = line.split('\t')[2]
if reference_mapped not in ref2umi:
ref2umi[reference_mapped] = [corresponding_umi]
ref2umi[reference_mapped].append(corresponding_umi)
if reference_mapped not in ref2reads:
ref2reads[reference_mapped] = [read_mapped]
ref2reads[reference_mapped].append(read_mapped)
#Output : tab separated file
#First column : target sequence
#Second column : number of aligned UMIs
#Third column : number of aligned reads
with open(output_path + "/" + sample_name + "_output.csv", 'w') as leOut:
leOut.write(">" + element + '\t' + str(len(set(ref2umi[element]))) + '\t' + str(len(ref2reads[element])) + '\n')