##by Krisna, M & Monteith, W 2023
# Map each locus listed in the input table to the GFF feature line that
# mentions it, pull the start/end coordinates and the original
# ``prev_locus`` name out of that line, and write the combined table to CSV.
import os

import pandas as pd


def parse_gff_record(record):
    """Return ``(start, end, prev_locus)`` parsed from one GFF feature line.

    ``record`` is a single tab-separated GFF line.  ``start`` and ``end`` are
    the 4th and 5th columns, returned as strings exactly as they appear in
    the file.  ``prev_locus`` is the text following ``prev_locus=`` up to the
    next ``;`` (or the end of the line); it is ``None`` when the line carries
    no ``prev_locus=`` tag.

    Raises ``IndexError`` if the line has fewer than nine columns.
    """
    # Strip the line ending first so it cannot leak into the last column.
    columns = record.rstrip('\r\n').split('\t')
    start, end = columns[3], columns[4]
    # NOTE(review): the attribute string is taken from column 9, as laid out
    # by the GFF3 format; the original script never showed where ``data``
    # came from -- confirm.
    attributes = columns[8]
    _, tag, tail = attributes.partition('prev_locus=')
    if not tag:
        return start, end, None
    # The previous version computed this tail and then split the *whole*
    # attribute string instead, which yielded the first attribute rather
    # than the prev_locus value.
    return start, end, tail.split(';', 1)[0]


def find_locus_records(loci, filenames):
    """Return ``{locus: line}`` for the first feature line mentioning each locus.

    Files are scanned once each, in the order given, and the first matching
    line wins.  Only lines with at least nine tab-separated columns are
    considered, so every stored line can be handed to ``parse_gff_record``.
    Loci that are never seen are simply absent from the result.
    """
    pending = set(loci)
    found = {}
    for filename in filenames:
        if not pending:
            break  # everything located; no need to open further files
        with open(filename, 'r') as handle:
            for line in handle:
                if line.startswith('##FASTA'):
                    break  # GFF3: only sequence data follows this directive
                if line.startswith('#') or line.count('\t') < 8:
                    continue
                # NOTE(review): plain substring match, so a locus that is a
                # prefix of a longer identifier on the same line also
                # matches -- confirm this is the intended criterion.
                hits = [locus for locus in pending if locus in line]
                if not hits:
                    continue
                for locus in hits:
                    found[locus] = line
                pending.difference_update(hits)
                if not pending:
                    break
    return found


if __name__ == '__main__':
    df_p2 = pd.read_csv('/path/to/input_file.csv')
    core_gene_list_p2 = df_p2['locus_id'].to_list()

    os.chdir('/path/to/selected/modified/gff/files')
    # NOTE(review): ``files_p2`` was never assigned in the original script.
    # This assumes every regular file in the directory is a GFF to search --
    # confirm, or narrow the filter (e.g. by extension).
    files_p2 = sorted(name for name in os.listdir('.') if os.path.isfile(name))

    result_p2 = find_locus_records(core_gene_list_p2, files_p2)

    # Report every locus that could not be located in any GFF file.
    if len(result_p2) != len(core_gene_list_p2):
        for locus in core_gene_list_p2:
            if locus not in result_p2:
                print(locus, 'not found in gffs')

    start_loc_p2 = []
    end_loc_p2 = []
    prev_locus = []
    for locus in core_gene_list_p2:
        record = result_p2.get(locus)
        if record is None:
            # Keep the columns aligned with df_p2: a locus that was reported
            # as missing gets empty cells instead of raising KeyError.
            start, end, prokka_name = None, None, None
        else:
            start, end, prokka_name = parse_gff_record(record)
        start_loc_p2.append(start)
        end_loc_p2.append(end)
        prev_locus.append(prokka_name)

    pirate_output = pd.DataFrame(
        {
            'gene_family': df_p2['gene_family'],
            'locus_id': df_p2['locus_id'],
            'isolate_id': df_p2['isolate_id'],
            'start_loc': start_loc_p2,
            # NOTE(review): end coordinates were collected but never written
            # in the original; added here as a new column -- confirm.
            'end_loc': end_loc_p2,
            'prokka_locus_name': prev_locus,
        }
    )
    pirate_output.to_csv('/path/to/output_file.csv')