OREO script

Plain Text icon OREO.txt — Plain Text, 16 kB (16722 bytes)

File contents

#!/bin/bash

#This simple script was written in the Bash unix shell and command language to be used on the command line. It also uses standard awk for calculations with tabulated data. To try it download the file and place it in the same folder with your files of interest. Then make OREO.sh executable and start it by typing  ./OREO.sh  

#startline
	
	echo "  ___  ____  _____ ___  "
	echo " / _ \|  _ \| ____/ _ \ "
	echo "| | | | |_) |  _|| | | |"
	echo "| |_| |  _ <| |__| |_| |"
	echo " \___/|_| \_\_____\___/ "
	echo " "
	echo "--------------------------------------"
	echo "OREO data processing tool v1.0.0, 2018"
	echo " "
	echo "University of Leicester, UK"
	echo " "
	echo "It is best practice to quality control your data prior to calling variants."
	echo "QC, as a minimum, should involve removing any leftover adapters, including adapter read-through"
	echo "and the removal of low-quality bases from the reads."
	echo " "

#asking users if their data is QC-ed, and directing action accordingly:

	read -p "	Have you performed QC on your files (Y/N)?:" qc

case $qc in

   [yY] | [yY][Ee][Ss] )

	echo " "
	echo "OREO processes QC-ed FASTQ files."
	echo " "
	;;
       
   * )  
	echo " "   
	echo "Have a biscuit and when you QC-ed your FASTQ files try again and type yes."
	exit;;
esac

	echo "OREO - Overarching Read Enrichment Option"
	echo " "	
	echo "OREO was written for assessing mtDNA heteroplasmy calls from massively parallel sequencing data from overlapping amplicons."
	echo "OREO improves the accuracy of quantification of Alternative variant % over putative primer sites."
	echo "Its use can be easily adapted to other templates for studies that use overlapping amplicons with MPS, where detection "
	echo "of variants can benefit from bypassing the interference from proprietary primers, which cannot otherwise be directly clipped"
	echo "or trimmed from the reads."
	echo " "

#asking users if their data is PE or SE:

	read -p "	Enter your data type (PE for paired-end or SE for single-end data):" vartype

case $vartype in
    PE ) 

	echo " "
	echo "Running OREO for paired-end (PE) data."
	echo "For processing, input files are expected to be in a name format, where the samplename followed by the read# and the"
	echo "extension, and are separated by '.' (e.g. sample01.R1.fastq and sample01.R2.fastq)."
	echo " "
	read -p "	What is your extension format following the '.' in your filenames after R# (e.g. fastq): " extform
	echo " "


#prompted user input lines asking for details of the variant to be quantified and the samples to look at:
	
	read -p "	Enter the position with the variant to be quantified (e.g.16153): " pos
	echo " "
	read -p "	Enter the reference allele at this position (e.g. G): " ref
	echo " "
	read -p "	Enter the alternative allele at this position (e.g. A): " alt
	echo " "
	read -p "	Enter any other nearby variant included in your probes (e.g. none, 16158G): " other
	echo " "
	echo "Probe sequences are to be designed on a case-by-case basis to capture the only reads at the site of interest, which are"
	echo "overarching the amplicon ends therefore spanning the likely primer sites and thus bypassing reads with sequences derived"
	echo "from primers."
	echo " "
	echo "When designing your own probes find amplicon ends (take hints from coverage tracks) and pick short but specific sequences"
	echo "which bait for reads that go beyond the amplicon end defined by primers."
	echo "Only these overarching reads satisfy the probes and only these are retained for calculating accurate variant levels."
	echo " "
	echo "Design the probes for both the reference and the alternative allele(s) and for their reverse complement as well."
	echo "Be aware that manufacturers may change their primers slightly, therefore verify using coverage tracks that the positions of"
	echo "your earlier probes are still relevant to capture overarching reads in your current experiment."
	echo " "
	read -p "	For Reference allele Forward direction - Enter your own probe or copy it from table of probes provided (e.g. 	GACCACCTGTAGTAC ):" probeREFFOR
	echo " "
	read -p "	For Alternative allele Forward direction - Enter your own probe or copy it from table of probes provided (e.g. 	GACCACCTATAGTAC ):" probeALTFOR
	echo " "
	read -p "	For Reference allele Reverse direction - Enter your own probe or copy it from table of probes provided (e.g. 	GTACTACAGGTGGTC ):" probeREFREV
	echo " "
	read -p "	For Alternative allele Reverse direction - Enter your own probe or copy it from table of probes provided (e.g. 	GTACTATAGGTGGTC ):" probeALTREV
	echo " "

array=()
while IFS= read -r -p "	Enter one sample name at a time (e.g. sample01), finish input with an empty line): " sample
	do [[ $sample ]] || break  
    		array+=("$sample")
	done

printf '%s\n' "Samples to process:"
printf '%s\n' "${array[@]}"

for sn in "${array[@]}"
do

##optionally, for batch processing and/or to define these variables and the QC-ed FASTQ files directly in the script, rather than a prompted user input, activate these lines and edit accordingly (enter variables, also adjust the sample name structure as it is set here to the sample##.R#.fastq format):
##pos=16153
##ref=G
##alt=A
##other=none
##probeREFFOR=GACCACCTGTAGTAC
##probeALTFOR=GACCACCTATAGTAC
##probeREFREV=GTACTACAGGTGGTC
##probeALTREV=GTACTATAGGTGGTC
##for file in `ls *.R1.fastq | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/ /g'`
##do
##sn=`ls $file | grep -o sample[0123456789][0123456789]`
##extform= `ls $file | cut -d. -f3 `


#check if R1 and R2 files are present for PE files, and direct action accordingly:

if [ -e $sn.R1.$extform ]
then
	echo " "
    	echo "OREO found R1 file for $sn : $sn.R1.$extform "

	if [ -e $sn.R2.$extform ]
		then
    		echo "OREO found R2 file for $sn : $sn.R2.$extform "


#when both files are found the following steps are followed:


	#generating the summary file heading for the sample:

	echo "Summary" > $sn.PE.$pos.count

	echo "	$sn at position $pos" >> $sn.PE.$pos.count

	echo "Reference and Alternative allele specific probe counts in $vartype QC-ed FASTQ files:" >> $sn.PE.$pos.count

	echo "	$sn.R1.$extform and $sn.R2.$extform " >> $sn.PE.$pos.count

	echo "Other variants present in probe: $other" >> $sn.PE.$pos.count

	echo "At position $pos the reference allele is $ref and the alternative allele is $alt." >> $sn.PE.$pos.count

	echo "Probes entered to be used by OREO:" >> $sn.PE.$pos.count

	echo " 	for reference allele (forward/reverse)	 	$probeREFFOR / $probeREFREV " >> $sn.PE.$pos.count

	echo " 	for alternative allele (forward/reverse)	$probeALTFOR / $probeALTREV " >> $sn.PE.$pos.count

	echo "Reads counted by these probes:" >> $sn.PE.$pos.count

	echo "______________________________" >> $sn.PE.$pos.count

	echo " " >> $sn.PE.$pos.count

	echo "read	reference_F	reference_R	alternative_F	alternative_R" >> $sn.PE.$pos.count


	#counting the reads captured by the defined set of probes in the appropriate files for the sample in an interim file:

	echo "in_R1:" | tr "\n" " " > $sn.$pos.int

	grep -c $probeREFFOR $sn.R1.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeREFREV $sn.R1.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeALTFOR $sn.R1.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeALTREV $sn.R1.$extform >> $sn.$pos.int

	echo "in_R2:" | tr "\n" " " >> $sn.$pos.int

	grep -c $probeREFFOR $sn.R2.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeREFREV $sn.R2.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeALTFOR $sn.R2.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeALTREV $sn.R2.$extform >> $sn.$pos.int


	# calculating sum of readpairs in both R1 and R2 in the interim file:
	#Be aware that the sum of R1 and R2 is not to be taken at face value, as technically each read should be registered once in each R1 and R2. However how well balanced one would see these representedin both, depends on the position of the variant within the readplus the length of the amplicon.Variants closer to the 5' end of the amplicon would be more represented in R1 and less represented in R2 and vice versa. The imbalance is coming from the dropping quality along the length of the reads. To factor this in for all alleles the R1 and R2 counts are summed before the Alternative variant % is calculated. 

	echo "in_R1+R2:" | tr "\n" " " >> $sn.$pos.int 

	awk ' {sum2+=$2; sum3+=$3; sum4+=$4; sum5+=$5} END {print sum2,sum3,sum4,sum5}' $sn.$pos.int >> $sn.$pos.int


	# calculating ALT% in the interim file:

	echo "Alternative_variant_%:" | tr "\n" " " >> $sn.$pos.int

	awk '{NR=3; all+=$2+$3+$4+$5; alt+=$4+$5} END {print alt/all*100}' $sn.$pos.int  >> $sn.$pos.int


	#completing the summary file:

	cat $sn.$pos.int | tr ' ' '\t'| tr '_' ' ' >> $sn.PE.$pos.count

	echo "______________________________" >> $sn.PE.$pos.count


	#cleaning up the interim file:

	rm $sn.$pos.int

#feedback to print to screen

	if [ -e $sn.PE.$pos.count ]
	then
    		echo "OREO calculated Alternative variant % for $sn. Results are in the file:  $sn.PE.$pos.count "
	else
		echo " "
		echo "OREO failed to calculate Alternative variant % for $sn. Chosen option is for PE files, have a biscuit and for SE files enter data type as SE."
	fi 


#when either or both input files are missing the steps above are not completed, instead the following is printed on the screen:

		else
			echo " "
    			echo "OREO failed to find R2, the pair of your R1 input file for $sn. Have a biscuit and check if both of your files are in a '$sn.R#.$extform' name format."
		fi 


else
	echo " "
	echo "OREO failed to find R1 PE input file for $sn."

	if [ -e $sn.R2.$extform ]
	then
    		echo "However, OREO found R2 PE input file for $sn. Have a biscuit and check if both of your files for $sn are in a $sn.R#.$extform name format."
	else
    		echo "OREO failed to find R2 PE input file for $sn. Have a biscuit and check if both of your files for $sn are in a $sn.R#.$extform name format."
	fi 
fi

done 
echo " "
echo "Thank you for using OREO!"
	exit;;

   SE )
	echo " "
	echo "Running OREO for single-end (SE) data."
	echo "For processing, input files are expected to be in a name format, where the samplename is followed by an extension,"
	echo "separated by a '.'(e.g. sample01.fastq )."
	echo " "
	read -p "	What is your extension format in your filenames following the '.' after your samplename?' (e.g. fastq): " extform
	echo " "
	;;
       
   * )     
	echo " "
	echo "Have a biscuit, then try again and enter SE or PE to run the scripts."
	echo " "
	exit;;
esac

#prompted user input lines asking for details to be quantified:
	
	read -p "	Enter the position with the variant to be quantified (e.g.16153): " pos
	echo " "
	read -p "	Enter the reference allele at this position (e.g. G): " ref
	echo " "
	read -p "	Enter the alternative allele at this position (e.g. A): " alt
	echo " "
	read -p "	Enter any other variant in your probes (e.g. none, 16158G): " other
	echo " "
	echo "Probe sequences are to be designed on a case-by-case basis to capture the only reads at the site of interest, which are"
	echo "overarching the amplicon ends therefore spanning the likely primer sites and thus bypassing reads with sequences derived"
	echo "from primers."
	echo " "
	echo "When designing your own probes find amplicon ends (take hints from coverage tracks) and pick short but specific sequences"
	echo "which bait for reads that go beyond the amplicon end defined by primers."
	echo "Only these overarching reads satisfy the probes and only these are retained for calculating accurate variant levels."
	echo " "
	echo "Design the probes for both the reference and the alternative allele(s) and for their reverse complement as well."
	echo "Be aware that manufacturers may change their primers slightly, therefore verify using coverage tracks that the positions of"
	echo "your earlier probes are still relevant to capture overarching reads in your current experiment."
	echo " "
	read -p "	For Reference allele Forward direction - Enter your own probe or copy it from table of probes provided (e.g. 	GACCACCTGTAGTAC ):" probeREFFOR
	echo " "
	read -p "	For Alternative allele Forward direction - Enter your own probe or copy it from table of probes provided (e.g. 	GACCACCTATAGTAC ):" probeALTFOR
	echo " "
	read -p "	For Reference allele Reverse direction - Enter your own probe or copy it from table of probes provided (e.g. 	GTACTACAGGTGGTC ):" probeREFREV
	echo " "
	read -p "	For Alternative allele Reverse direction - Enter your own probe or copy it from table of probes provided (e.g. 	GTACTATAGGTGGTC ):" probeALTREV
	echo " "

array=()
while IFS= read -r -p "	Enter one sample name at a time (e.g. sample01), finish input with an empty line): " sample
	do [[ $sample ]] || break  
    		array+=("$sample")
	done

printf '%s\n' "Samples to process:"
printf '%s\n' "${array[@]}"

for sn in "${array[@]}"
do

##optionally, for batch processing and/or to define these variables and the QC-ed FASTQ files directly in the script, rather than a prompted user input, activate these lines and edit accordingly (enter variables, also adjust the sample name structure as it is set here to the sample##.fastq format):
##pos=16153
##ref=G
##alt=A
##other=none
##probeREFFOR=GACCACCTGTAGTAC
##probeALTFOR=GACCACCTATAGTAC
##probeREFREV=GTACTACAGGTGGTC
##probeALTREV=GTACTATAGGTGGTC
##for file in `ls *.fastq | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/ /g'`
##do
##sn=`ls $file | grep -o sample[0123456789][0123456789]`
##extform= `ls $file | cut -d. -f3 `


#check if the files are in the 'sample##.fastq' name format for processing, and direct action accordingly:

if [ -e $sn.$extform ]
then
	echo " "   	
	echo "OREO found input file for $sn : $sn.$extform "

#when file is found the following steps are followed:

	#generating the summary file heading for the sample:

	echo "Summary" > $sn.SE.$pos.count

	echo "	$sn at position $pos" >> $sn.SE.$pos.count

	echo "Reference and Alternative allele specific probe counts in $vartype QC-ed FASTQ file:" >> $sn.SE.$pos.count

	echo "	$sn.$extform" >> $sn.SE.$pos.count

	echo "Other variants present in probe: $other " >> $sn.SE.$pos.count

	echo "At position $pos the reference allele is $ref and the alternative allele is $alt." >> $sn.SE.$pos.count

	echo "Probes entered to be used by OREO:" >> $sn.SE.$pos.count

	echo " 	for reference allele (forward/reverse) 		$probeREFFOR / $probeREFREV " >> $sn.SE.$pos.count

	echo " 	for alternative allele (forward/reverse) 	$probeALTFOR / $probeALTREV " >> $sn.SE.$pos.count

	echo "Reads counted by these probes: " >> $sn.SE.$pos.count

	echo "______________________________" >> $sn.SE.$pos.count

	echo " " >> $sn.SE.$pos.count

	echo "read	reference_F	reference_R	alternative_F	alternative_R" >> $sn.SE.$pos.count


	#counting the reads captured by the defined set of probes in the appropriate files for the sample in an interim file:

	echo "in_reads:" | tr "\n" " " > $sn.$pos.int

	grep -c $probeREFFOR $sn.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeREFREV $sn.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeALTFOR $sn.$extform | tr "\n" " " >> $sn.$pos.int

	grep -c $probeALTREV $sn.$extform >> $sn.$pos.int


	# calculating ALT% in the interim file:

	echo "Alternative_variant_%:" | tr "\n" " " >> $sn.$pos.int

	awk '{NR=1; all+=$2+$3+$4+$5; alt+=$4+$5} END {print alt/all*100}' $sn.$pos.int  >> $sn.$pos.int


	#completing the summary file:

	cat $sn.$pos.int | tr ' ' '\t'| tr '_' ' ' >> $sn.SE.$pos.count

	echo "______________________________" >> $sn.SE.$pos.count


	#cleaning up the interim file:

	rm $sn.$pos.int

#feedback to print to screen

		if [ -e $sn.SE.$pos.count ]
		then
    			echo "OREO calculated Alternative variant % for $sn. Results are in the file: $sn.SE.$pos.count "
	
		else
			echo " "
			echo "OREO failed to calculate Alternative variant % for $sn. Chosen option is for SE files, have a biscuit and for PE files enter data type as PE."
		fi 

#when the SE input file is missing the steps above are not completed, instead the following is printed on the screen:

else
	echo " "
    	echo "OREO failed to find the SE input file for $sn to process. Have a biscuit and check if your files are in a '$sn.$extform' name format."
fi 

done 
echo " "
echo "Thank you for using OREO!"

Share this page: