## load used barcode identifiers
bcNames <- read.table("barcode_counts.txt", stringsAsFactors=FALSE)[,2];
## set gene annotation file location
annotationFile <- "ensembl_GRCm39_geneFeatureLocations.txt.gz";
## load ensemble transcript metadata (including gene name)
ensembl.df <- read_delim(annotationFile, delim="\t");
c("Transcript stable ID" = "transcript",
"Gene description" = "Description",
"Gene start (bp)" = "Start",
"Chromosome/scaffold name" = "Chr")[colnames(ensembl.df)];
ensembl.df$Description <- sub(" \\[.*$","",ensembl.df$Description);
ensembl.df$Description <- sub("^(.{50}).+$","\\1...",ensembl.df$Description);
options(scipen=15); ## don't show scientific notation for large positions
for(mapper in c("LAST")){
checkName <- sprintf("mapped/trnCounts_%s_%s_vs_Mmus_transcriptome.txt.gz",
if(!file.exists(checkName)){
cat(sprintf("Warning: %s does not exist. If you are using mapper '%s', consider this an error\n", checkName, mapper));
## load count data into "narrow" array (one line per count)
sprintf("mapped/trnCounts_%s_%s_vs_Mmus_transcriptome.txt.gz",
read_table2(col_names=c("count","barcode",
## remove revision number from transcript names (if present)
trn.counts$transcript <- sub("\\.[0-9]+$","",trn.counts$transcript);
## convert to wide format (one line per transcript)
trn.counts.wide <- spread(trn.counts, barcode, count) %>%
mutate(dir = c("+"="fwd", "-"="rev")[dir]);
for(bd in colnames(trn.counts.wide[,-1])){
trn.counts.wide[[bd]] <- replace_na(trn.counts.wide[[bd]],0);
## merge ensembl metadata with transcript counts
gene.counts.wide <- inner_join(ensembl.df, trn.counts.wide, by="transcript");
gene.counts.wide <- gene.counts.wide[order(-rowSums(gene.counts.wide[,-(1:8)])),];
## write result out to a file
write.csv(gene.counts.wide,
file=sprintf("wide_transcript_counts_%s_%s.csv", mapper, Sys.Date()),