Skip to content

Commit ebc8f9a

Browse files
committed
Quality_Assessment now generates a combined png image of all FASTQC plots
Clarify error messages for Adapter_Trimming and Read_Mapping RTC and Indel_Realigner now support fixing quality scores that are too high Automatically check for .bai indexing in Haplotype_Caller, RTC, and Indel_Realigner
1 parent e82460c commit ebc8f9a

9 files changed

Lines changed: 117 additions & 47 deletions

Config

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/bin/bash
22

3+
# Config version 2.00
34
# More complete information on how to fill out
45
# this Config file can be found at:
56
# https://github.com/MorrellLAB/sequence_handling/wiki/Configuration
@@ -37,6 +38,11 @@ REF_GEN=
3738
# Choose from: "true" or "false"
3839
BARLEY=
3940

41+
# Do the quality scores need to be adjusted for GATK? Default: false
42+
# Change to true if you get errors from GATK like:
43+
# "<Sample> appears to be using the wrong encoding for quality scores: we encountered an extremely high quality score"
44+
FIX_QUALITY_SCORES=false
45+
4046
############################################
4147
########## GBS_Demultiplex ##########
4248
############################################
@@ -368,11 +374,6 @@ FINISHED_BAM_LIST=
368374
# For soybean: 0.001
369375
THETA=
370376

371-
# Do the quality scores need to be adjusted? Default: false
372-
# Change to true only if you get errors from GATK like:
373-
# "<Sample> appears to be using the wrong encoding for quality scores: we encountered an extremely high quality score"
374-
FIX_QUALITY_SCORES=false
375-
376377
############################################
377378
########## Genotype_GVCFs ##########
378379
############################################
@@ -401,9 +402,9 @@ REF_DICT=
401402
# For soybean: 20
402403
NUM_CHR=
403404

404-
# Optional: Include the full file path to a list of the names of any and all scaffolds or parts of the reference not covered by the chromosomes above
405-
# One scaffold name per line, must end in .intervals, leave blank if you wish to not call SNPs on non-chromosomal sequence
406-
# Samtools style intervals are also acceptable, one per line (ex: chr1:100-200)
405+
# Optional: Leave blank if you do not wish to call SNPs on non-chromosomal sequence
406+
# Include the full file path to a list of the names of any and all scaffolds or parts of the reference not covered by the chromosomes above
407+
# One scaffold name per line, must end in .intervals, SAMtools style intervals are also acceptable, one per line (ex: chr1:100-200)
407408
CUSTOM_INTERVALS=
408409

409410
# What is the sample ploidy? (integer value)
@@ -428,26 +429,26 @@ CHS_VCF_LIST=
428429
# If not exome capture, put "NA"
429430
CAPTURE_REGIONS=
430431

431-
# What is the depth per sample (DP) cutoff?
432+
# What is the depth per sample (DP) cutoff? (integer)
432433
# If a sample's DP is below this threshold, it will count as a "bad" sample for that site.
433434
# Recommended value: 5
434435
CHS_DP_PER_SAMPLE_CUTOFF=5
435436

436-
# What is the genotyping quality (GQ) cutoff?
437+
# What is the genotyping quality (GQ) cutoff? (integer)
437438
# If a sample's GQ is below this threshold, it will count as a "bad" sample for that site.
438439
# Recommended value: 10th percentile of the raw GQ percentile table
439440
CHS_GQ_CUTOFF=
440441

441-
# What is the maximum number of "bad" (low GQ, low DP, or missing genotype data) samples allowed at a site?
442+
# What is the maximum number of "bad" (low GQ, low DP, or missing genotype data) samples allowed at a site? (integer)
442443
# Sites with more "bad" samples than this threshold will be filtered out.
443444
# Recommended value: total number of samples * 0.2 (rounded to the nearest whole number)
444445
CHS_MAX_BAD=
445446

446-
# What is the maximum number of samples at a site that can be heterozygous?
447+
# What is the maximum number of samples at a site that can be heterozygous? (integer)
447448
# Recommended value: total number of samples * 0.9 (rounded to the nearest whole number)
448449
CHS_MAX_HET=
449450

450-
# What is the QUAL score cutoff?
451+
# What is the QUAL score cutoff? (integer)
451452
# Sites with a QUAL below this cutoff will be excluded.
452453
# Recommended value: 40
453454
CHS_QUAL_CUTOFF=40
@@ -513,39 +514,39 @@ VF_VCF=${OUT_DIR}/Variant_Recalibrator/${PROJECT}_recalibrated.vcf
513514
# If not exome capture, put "NA"
514515
VF_CAPTURE_REGIONS=
515516

516-
# What is the minimum number of reads needed to support a genotype?
517+
# What is the minimum number of reads needed to support a genotype? (integer)
517518
# Recommended value: 5
518519
MIN_DP=5
519520

520-
# What is the maximum number of reads allowed to support a genotype? (too many = gene duplication problems)
521+
# What is the maximum number of reads allowed to support a genotype? (too many = gene duplication problems) (integer)
521522
# Recommended value: 95th percentile of the raw DP per sample percentile table
522523
MAX_DP=
523524

524-
# What is the maximum percent deviation from 50/50 reference/alternative reads allowed in heterozygotes?
525-
# For example, MAX_DEV=0.1 allows 60/40 ref/alt and also 40/60 ref/alt but not 70/30 or 30/70 ref/alt reads
525+
# What is the maximum percent deviation from 50/50 reference/alternative reads allowed in heterozygotes? (decimal)
526+
# For example, MAX_DEV=0.1 allows 60/40 ref/alt and also 40/60 ref/alt but not anything more skewed
526527
# Recommended value: 0.1
527528
MAX_DEV=0.1
528529

529-
# What is the depth per sample (DP) cutoff?
530+
# What is the depth per sample (DP) cutoff? (integer)
530531
# If a sample's DP is below this threshold, it will count as a "bad" sample for that site.
531532
# Recommended value: 5
532533
DP_PER_SAMPLE_CUTOFF=5
533534

534-
# What is the genotyping quality (GQ) cutoff?
535+
# What is the genotyping quality (GQ) cutoff? (integer)
535536
# If a sample's GQ is below this threshold, it will count as a "bad" sample for that site.
536537
# Recommended value: 10th percentile of the raw GQ percentile table
537538
GQ_CUTOFF=
538539

539-
# What is the maximum number of "bad" (low GQ, low DP, or missing genotype data) samples allowed at a site?
540+
# What is the maximum number of "bad" (low GQ, low DP, or missing genotype data) samples allowed at a site? (integer)
540541
# Sites with more "bad" samples than this threshold will be filtered out.
541542
# Recommended value: total number of samples * 0.2 (rounded to the nearest whole number)
542543
MAX_BAD=
543544

544-
# What is the maximum number of samples at a site that can be heterozygous?
545+
# What is the maximum number of samples at a site that can be heterozygous? (integer)
545546
# Recommended value: total number of samples * 0.9 (rounded to the nearest whole number)
546547
MAX_HET=
547548

548-
# What is the QUAL score cutoff?
549+
# What is the QUAL score cutoff? (integer)
549550
# Sites with a QUAL below this cutoff will be excluded.
550551
# Recommended value: 40
551552
QUAL_CUTOFF=40
@@ -597,6 +598,11 @@ module load fastqc/0.11.5
597598
#FASTQC=
598599
#export PATH=${FASTQC}:${PATH}
599600

601+
# Do we have Riss-util installed?
602+
module load riss_util/1.0
603+
#RISS_UTIL=
604+
#export PATH=${RISS_UTIL}:${PATH}
605+
600606
# Do we have Seqqs installed?
601607
module load seqqs_ML/3d05375
602608
#SEQQS=

Handlers/Adapter_Trimming.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ function trimAdapters() {
5151
# Make a named for the trimmed sample
5252
local trimmed="${out}/${name}_ScytheTrimmed.fastq.gz"
5353
# Trim the sample
54-
scythe -a "${adapters}" -p "${prior}" -q "${platform}" --quiet "${toTrim}" | gzip -c > "${trimmed}"
54+
(set -x; scythe -a "${adapters}" -p "${prior}" -q "${platform}" --quiet "${toTrim}" | gzip -c > "${trimmed}")
5555
}
5656

5757
# Export the function

Handlers/Indel_Realigner.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ function Indel_Realigner() {
2121
local intervals_list="$6" # Where is the list of intervals files?
2222
local lod="$7" # What is the LOD threshold?
2323
local entropy="$8" # What is the entropy threshold?
24+
local qscores="$9" # Do we fix quality scores?
2425
declare -a intervals_array=($(grep -E ".intervals" "${intervals_list}")) # Put the intervals list into array format
2526
local current_intervals="${intervals_array[${PBS_ARRAYID}]}" # Get the intervals file for the current sample
2627
declare -a sample_array=($(grep -E ".bam" "${sample_list}")) # Put the sample list into array format
@@ -30,6 +31,8 @@ function Indel_Realigner() {
3031
mkdir -p "${out}"
3132
# Change into the output directory
3233
cd "${out}"
34+
if [[ "${qscores}" == true ]]
35+
then
3336
# Run GATK using the parameters given
3437
(set -x; java -Xmx"${memory}" -jar "${gatk}" \
3538
-T IndelRealigner \
@@ -38,7 +41,19 @@ function Indel_Realigner() {
3841
--LODThresholdForCleaning "${lod}" \
3942
--targetIntervals "${current_intervals}" \
4043
-I "${current}" \
44+
--fix_misencoded_quality_scores \
4145
-o "${out}/${name}_realigned.bam")
46+
else
47+
# Run GATK using the parameters given
48+
(set -x; java -Xmx"${memory}" -jar "${gatk}" \
49+
-T IndelRealigner \
50+
-R "${reference}" \
51+
--entropyThreshold "${entropy}" \
52+
--LODThresholdForCleaning "${lod}" \
53+
--targetIntervals "${current_intervals}" \
54+
-I "${current}" \
55+
-o "${out}/${name}_realigned.bam")
56+
fi
4257
}
4358

4459
# Export the function

Handlers/Quality_Assessment.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@ function Quality_Assessment() {
7373
tail -n +2 "${out}/${project}_quality_summary_unfinished.txt" | sort >> "${out}/${project}_quality_summary.txt"
7474
# Remove the unsorted file
7575
rm "${out}/${project}_quality_summary_unfinished.txt"
76+
# Change into the Zip_Files directory
77+
cd "${out}/Zip_Files"
78+
# Combine all plots into one file
79+
fasterqc.pl -o "${out}/${project}_quality_plots.png"
80+
# Remove the other file generated by fasterqc.pl
81+
rm "${out}/Zip_Files/fasterqc.overrep.txt"
7682
}
7783

7884
export -f Quality_Assessment

Handlers/Read_Mapping.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ function Read_Mapping_Paired() {
7878
mkdir -p "${outDirectory}" # Make our outdirectory
7979
local memSettings=$(ParseBWASettings) # Assemble our settings for BWA mem
8080
local readGroupID=$(createReadGroupID "${sampleName}" "${project}" "${platform}") # Assemble our read group ID
81-
bwa mem "${memSettings}" -R "${readGroupID}" "${reference}" "${forwardSample}" "${reverseSample}" > "${outDirectory}"/"${sampleName}".sam # Read map our sample
81+
(set -x; bwa mem "${memSettings}" -v 2 -R "${readGroupID}" "${reference}" "${forwardSample}" "${reverseSample}" > "${outDirectory}"/"${sampleName}".sam)
8282
}
8383

8484
# Export the function
@@ -95,7 +95,7 @@ function Read_Mapping_Singles() {
9595
mkdir -p "${outDirectory}" # Make our outdirectory
9696
local memSettings=$(ParseBWASettings) # Assemble our settings for BWA mem
9797
local readGroupID=$(createReadGroupID "${sampleName}" "${project}" "${platform}") # Assemble our read group ID
98-
bwa mem "${memSettings}" -R "${readGroupID}" "${reference}" "${sampleFile}" > "${outDirectory}"/"${sampleName}".sam # Read map our sample
98+
(set -x; bwa mem "${memSettings}" -v 2 -R "${readGroupID}" "${reference}" "${sampleFile}" > "${outDirectory}"/"${sampleName}".sam)
9999
}
100100

101101
# Export the function

Handlers/Realigner_Target_Creator.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,31 @@ function Realigner_Target_Creator() {
1818
local gatk="$3" # Where is the GATK jar?
1919
local reference="$4" # Where is the reference sequence?
2020
local memory="$5" # How much memory can Java use?
21+
local qscores="$6" # Do we fix quality scores?
2122
declare -a sample_array=($(grep -E ".bam" "${sample_list}")) # Put the sample list into array format
2223
local current="${sample_array[${PBS_ARRAYID}]}" # Pull out one sample to work on
2324
local name=$(basename ${current} .bam) # Get the name of the sample without the extension
2425
# Make sure the out directory exists
2526
mkdir -p "${out}"
27+
if [[ "${qscores}" == true ]]
28+
then
2629
# Run GATK using the parameters given
2730
(set -x; java -Xmx"${memory}" -jar "${gatk}" \
2831
-T RealignerTargetCreator \
2932
-R "${reference}" \
3033
-nt 1 \
3134
-I "${current}" \
35+
--fix_misencoded_quality_scores \
3236
-o "${out}/${name}.intervals")
37+
else
38+
# Run GATK using the parameters given
39+
(set -x; java -Xmx"${memory}" -jar "${gatk}" \
40+
-T RealignerTargetCreator \
41+
-R "${reference}" \
42+
-nt 1 \
43+
-I "${current}" \
44+
-o "${out}/${name}.intervals")
45+
fi
3346
}
3447

3548
# Export the function

Handlers/SAM_Processing_Picard.sh

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,59 +36,65 @@ function SAM_Processing(){
3636
if [[ -z ${tmp} ]] # If tmp is left blank
3737
then
3838
# Sort the SAM file and convert to BAM
39-
(set -x; java -Xmx"${maxMem}" -jar ${picardJar} SortSam \
39+
java -Xmx"${maxMem}" -jar ${picardJar} SortSam \
4040
INPUT="${SAMFile}" \
4141
OUTPUT="${outDirectory}/Intermediates/Sorted/${sampleName}_sorted.bam" \
4242
SO="coordinate" \
43-
VALIDATION_STRINGENCY="SILENT")
43+
VERBOSITY="WARNING" \
44+
VALIDATION_STRINGENCY="SILENT"
4445
# Deduplicate the BAM file
45-
(set -x; java -Xmx"${maxMem}" -jar ${picardJar} MarkDuplicates \
46+
java -Xmx"${maxMem}" -jar ${picardJar} MarkDuplicates \
4647
INPUT="${outDirectory}/Intermediates/Sorted/${sampleName}_sorted.bam" \
4748
OUTPUT="${outDirectory}/Intermediates/Deduplicated/${sampleName}_deduped.bam" \
4849
METRICS_FILE="${outDirectory}/Statistics/Deduplicated_BAM_Stats/${sampleName}_deduplicated.txt" \
4950
REMOVE_DUPLICATES="true" \
5051
ASSUME_SORTED="true" \
51-
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=${maxFiles})
52+
VERBOSITY="WARNING" \
53+
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=${maxFiles}
5254
# Add read group information to the BAM file
53-
(set -x; java -Xmx"${maxMem}" -jar ${picardJar} AddOrReplaceReadGroups \
55+
java -Xmx"${maxMem}" -jar ${picardJar} AddOrReplaceReadGroups \
5456
INPUT="${outDirectory}/Intermediates/Deduplicated/${sampleName}_deduped.bam" \
5557
OUTPUT="${outDirectory}/${sampleName}.bam" \
5658
RGID="${sampleName}" \
5759
RGLB="${sampleName}" \
5860
RGPL="${platform}" \
5961
RGPU="${sampleName}" \
60-
RGSM="${sampleName}")
62+
VERBOSITY="WARNING" \
63+
RGSM="${sampleName}"
6164
else # If a tmp is provided
6265
# Make sure tmp exists
6366
mkdir -p ${tmp}
6467
# Make sure we have write permissions for tmp
6568
if ! [[ -w "${tmp}" ]]; then echo "You do not have write permissions for ${tmp}, exiting..." >&2; exit 1; fi
6669
# Sort the SAM file and convert to BAM
67-
(set -x; java -Xmx"${maxMem}" -jar ${picardJar} SortSam \
70+
java -Xmx"${maxMem}" -jar ${picardJar} SortSam \
6871
INPUT="${SAMFile}" \
6972
OUTPUT="${outDirectory}/Intermediates/Sorted/${sampleName}_sorted.bam" \
7073
SO="coordinate" \
7174
VALIDATION_STRINGENCY="SILENT" \
72-
TMP_DIR="${tmp}")
75+
VERBOSITY="WARNING" \
76+
TMP_DIR="${tmp}"
7377
# Deduplicate the BAM file
74-
(set -x; java -Xmx"${maxMem}" -jar ${picardJar} MarkDuplicates \
78+
java -Xmx"${maxMem}" -jar ${picardJar} MarkDuplicates \
7579
INPUT="${outDirectory}/Intermediates/Sorted/${sampleName}_sorted.bam" \
7680
OUTPUT="${outDirectory}/Intermediates/Deduplicated/${sampleName}_deduped.bam" \
7781
METRICS_FILE="${outDirectory}/Statistics/Deduplicated_BAM_Stats/${sampleName}_deduplicated.txt" \
7882
REMOVE_DUPLICATES="true" \
7983
ASSUME_SORTED="true" \
8084
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=${maxFiles} \
81-
TMP_DIR="${tmp}")
85+
VERBOSITY="WARNING" \
86+
TMP_DIR="${tmp}"
8287
# Add read group information to the BAM file
83-
(set -x; java -Xmx"${maxMem}" -jar ${picardJar} AddOrReplaceReadGroups \
88+
java -Xmx"${maxMem}" -jar ${picardJar} AddOrReplaceReadGroups \
8489
INPUT="${outDirectory}/Intermediates/Deduplicated/${sampleName}_deduped.bam" \
8590
OUTPUT="${outDirectory}/${sampleName}.bam" \
8691
RGID="${sampleName}" \
8792
RGLB="${sampleName}" \
8893
RGPL="${platform}" \
8994
RGPU="${sampleName}" \
9095
RGSM="${sampleName}" \
91-
TMP_DIR="${tmp}")
96+
VERBOSITY="WARNING" \
97+
TMP_DIR="${tmp}"
9298
fi
9399
# Generate metrics on the finished BAM file
94100
samtools flagstat "${outDirectory}/${sampleName}.bam" > "${outDirectory}/Statistics/Finished_BAM_Stats/${sampleName}_finished.txt"

HelperScripts/utils.sh

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# Check to make sure our samples exist
44
function checkSamples() {
5-
local sample_list="$1" # Sample ist
5+
local sample_list="$1" # Sample list
66
if [[ -f "${sample_list}" ]] # If the sample list exists
77
then
88
if [[ -r "${sample_list}" ]] # If the sample list is readable
@@ -11,22 +11,22 @@ function checkSamples() {
1111
do
1212
if ! [[ -f "${sample}" ]] # If the sample doesn't exist
1313
then
14-
echo "${sample} does not exist, exiting..." >&2 # Exit out with error
14+
echo "The sample ${sample} does not exist, exiting..." >&2 # Exit out with error
1515
return 1
1616
else
1717
if ! [[ -r "${sample}" ]] # If the sample isn't readable
1818
then
19-
echo "${sample} does not have read permissions, exiting..." >&2
19+
echo "The sample ${sample} does not have read permissions, exiting..." >&2
2020
return 1
2121
fi
2222
fi
2323
done
2424
else # If the sample isn't readable
25-
echo "${sample_list} does not have read permissions, exiting..." >&2
25+
echo "The sample list ${sample_list} does not have read permissions, exiting..." >&2
2626
return 1
2727
fi
2828
else # If the sample list doesn't exist
29-
echo "$sample_list does not exist, exiting..." >&2 # Exit out with error
29+
echo "The sample list $sample_list does not exist, exiting..." >&2 # Exit out with error
3030
return 1
3131
fi
3232
}
@@ -149,4 +149,22 @@ function createDict() {
149149
}
150150

151151
# Export the function
152-
export -f createDict
152+
export -f createDict
153+
154+
# Check to make sure our BAM files are indexed
155+
function checkBaiIndex() {
156+
local sample_list="$1" # Sample list of BAM files, already checked by checkSamples (above)
157+
for sample in $(cat "${sample_list}") # For each sample in the sample list
158+
do
159+
local basename=$(basename "${sample}" .bam)
160+
local dirname=$(dirname "${sample}")
161+
if [[ ! -f "${dirname}/${basename}.bai" && ! -f "${dirname}/${basename}.bam.bai" ]] # If the sample doesn't have a .bai index file of either naming convention
162+
then
163+
echo "The sample ${sample} does not have a .bai index, exiting..." >&2
164+
exit 32
165+
fi
166+
done
167+
}
168+
169+
# Export the function
170+
export -f checkBaiIndex

0 commit comments

Comments
 (0)