forked from MorrellLAB/sequence_handling
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSAM_Processing_SAMTools.sh
More file actions
117 lines (101 loc) · 4.66 KB
/
SAM_Processing_SAMTools.sh
File metadata and controls
117 lines (101 loc) · 4.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/bin/bash
#PBS -l mem=12gb,nodes=1:ppn=8,walltime=24:00:00
#PBS -m abe
#PBS -M user@example.com
#PBS -q lab
set -e
set -u
set -o pipefail
module load parallel
# This script is a QSub submission script for converting a batch of SAM files to BAM files
# To use, on line 5, change the 'user@example.com' to your own email address
# to get notificaitons on start and completion of this script
# Define a path to SAMTools in the 'SAMTOOLS' field on line 44
# If using MSI, leave the definition as is to use their installation of SAMTools
# Otherwise, it should look like this:
# SAMTOOLS=${HOME}/software/samtools
# Please be sure to comment out (put a '#' symbol in front of) the 'module load samtools' on line 43
# And to uncomment (remove the '#' symbol) from lines 44 and 45
# Add the full file path to list of samples on the 'SAMPLE_INFO' field on line 48
# This should look like:
# SAMPLE_INFO=${HOME}/Directory/list.txt
# Use ${HOME}, as it is a link that the shell understands as your home directory
# and the rest is the full path to the actual list of samples
# Define a path to a reference genome on line 51
# This should look like:
# REF_GEN=${HOME}/Directory/reference_genome.fa
# Put the full directory path for the output in the 'SCRATCH' field on line 54
# This should look like:
# SCRATCH="${HOME}/Out_Directory"
# Adjust for your own out directory.
# Name the project in the 'PROJECT' field on line 57
# This should look lke:
# PROJECT=Barley
# Run this script with the qsub command
# qsub SAM_Processing_SAMTools.sh
# This script outputs sorted and deduplicated BAM files for each sample
# Load the SAMTools module for MSI, else define path to SAMTools installation
module load samtools
#SAMTOOLS=
#export PATH=$PATH:${SAMTOOLS}
# List of SAM files for conversion
SAMPLE_INFO=
# Reference genome to help base the conversion off of
REF_GEN=
# Scratch directory, for output
SCRATCH=
# Name of project
PROJECT=
# Make the outdirectories
OUT=${SCRATCH}/${PROJECT}/SAM_Processing
mkdir -p ${OUT}/stats ${OUT}/deduped ${OUT}/sorted ${OUT}/raw ${OUT}/fixed_header
# Check to see if SAMTools is installed
if `command -v samtools > /dev/null 2> /dev/null`
then
echo "SAMTools is installed"
else
echo "Please install SAMTools and place in your path"
exit 1
fi
# Define a function to process SAM files generated by BWA mem
# This will create the following files for each input SAM file:
# a sorted BAM file
# alignment statistics for the sorted BAM file
# a deduplicated BAM file
# alignment statistics for the deduplicated BAM file
function process_sam() {
# Today's date
YMD=`date +%Y-%m-%d`
SAMFILE="$1"
REF_SEQ="$2"
OUTDIR="$3"
# Sample name, taken from full name of SAM file
SAMPLE_NAME=`basename "${SAMFILE}" .sam`
# Remove unnecessary information from @PG line
# Could use sed's in-place option, but that fails on some systems
# This method bypasses that
sed 's/-R.*$//' "${SAMFILE}" > "${OUTDIR}"/fixed_header/"${SAMPLE_NAME}"_FixedHeader.sam
# Generate a sorted BAM file
samtools view -bhT "${REF_SEQ}" "${OUTDIR}"/fixed_header/"${SAMPLE_NAME}"_FixedHeader.sam > "${OUTDIR}/raw/${SAMPLE_NAME}_${YMD}_raw.bam"
# Create alignment statistics for the raw BAM file
samtools flagstat "${OUTDIR}/raw/${SAMPLE_NAME}_${YMD}_raw.bam" > "${OUTDIR}/stats/${SAMPLE_NAME}_${YMD}_raw_stats.out"
# Sort the raw BAM file
samtools sort "${OUTDIR}/raw/${SAMPLE_NAME}_${YMD}_raw.bam" "${OUTDIR}/sorted/${SAMPLE_NAME}_${YMD}_sorted"
# Deduplicate the sorted BAM file
samtools rmdup "${OUTDIR}/sorted/${SAMPLE_NAME}_${YMD}_sorted.bam" "${OUTDIR}/deduped/${SAMPLE_NAME}_${YMD}_deduped.bam"
# Create alignment statistics using SAMTools
samtools flagstat "${OUTDIR}/deduped/${SAMPLE_NAME}_${YMD}_deduped.bam" > "${OUTDIR}/stats/${SAMPLE_NAME}_${YMD}_deduped_stats.out"
}
# Export the SAM file processing function to be used by GNU Parallel
export -f process_sam
# Run the SAM file processing function in parallel for all input SAM files
cat ${SAMPLE_INFO} | parallel "process_sam {} ${REF_GEN} ${OUT}"
# Add read groups and merge the BAM files
samtools merge -r ${OUT}/${PROJECT}_Merged.bam `find "${OUT}"/deduped -name "*_deduped.bam"`
# Create a list of deduped BAM files
find ${OUT}/deduped -name "*_deduped.bam" | sort > ${OUT}/${PROJECT}_finished_BAM_files.txt
echo "List of BAM files can be found at"
echo "${OUT}/${PROJECT}_finished_BAM_files.txt"
echo
echo "Merged BAM file can be found at"
echo "${OUT}/${PROJECT}_Merged.bam"