scripts_histone_marks/map_RNAseq_data.sh at master · simras/scripts_histone_marks · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/bin/bash
#
# Automatic ChIP-seq mapping pipeline
#

set -e
set -x

# Bedgraph folder
bedgraphs="."
mapped="."
logs="."

# data folder
data="."

#TAIR10
baseGDIR="/home/simras/Work/histone_mark_data/genome"
genome="TAIR10.fa"

#Download_Folder
downloads="."
STAR="/home/simras/Work/STAR/bin/Linux_x86_64/STAR"

# path of MACS binary
MACS=/home/simras/Work/MACS-1.4.2/bin/macs14
bam=Aligned.out.bam
adapt="AGATCGGAAGAGC"

i=0
j=0
mapped_b=0

for SRR in $(cat SRRIDs.txt)
do
    tmp=${SRR%.fastq.gz}
    tmp1=${tmp%_*}
    SRR_ID=${tmp1##*/}

    # map data to assembly 1
    dat1=$downloads/$SRR_ID".fastq.gz"
    dat2=$downloads/$SRR_ID"_2.fastq.gz"

    if [ -f $dat2 ]
    then
	PAIRED="T"
	dat1=$downloads/$SRR_ID"_1.fastq.gz"
    else
	adapt=$(./pick_adapter.sh $dat1)

	PAIRED="F"
	dat2=""
    fi

    new_bam=$mapped/$SRR_ID\_$genome.bam
    sorted_bam=$mapped/$SRR_ID\_$genome"_"sorted.bam
    if [ ! -f $sorted_bam ]
    then
	mapped_b=1
	if [ $PAIRED == "T" ]
	then
	    ad1=$(./pick_adapter.sh $data/$SRR_ID\_1.fastq.gz)
	    ad2=$(./pick_adapter.sh $data/$SRR_ID\_2.fastq.gz)
	    datno1=$downloads/$SRR_ID"_noadt_1.fastq.gz"
	    datno2=$downloads/$SRR_ID"_noadt_2.fastq.gz"
	    #cutadapt -a $ad1 -A $ad2 -o $datno1 -p $datno2 $dat1 $dat2
	    if [ ! -f $datno2 ]
	    then
		cutadapt -a $ad2 -j 4 -o $datno2 $dat2
	    fi
	    if [ ! -f $datno1 ]
	    then
		cutadapt -a $ad1 -j 4 -o $datno1 $dat1
	    fi
	    $STAR --genomeDir $baseGDIR  --readFilesIn $datno1 $datno2 --runThreadN 8 --outSAMtype BAM Unsorted --readFilesCommand "zcat" --seedSearchStartLmax 30 --alignEndsType EndToEnd --alignIntronMax 1 --outSAMmultNmax 1

	else
	    $STAR --genomeDir $baseGDIR  --readFilesIn $dat1 --runThreadN 8 --outSAMtype BAM Unsorted --clip3pAdapterSeq $adapt --readFilesCommand "zcat" --seedSearchStartLmax 30 --alignEndsType EndToEnd --alignIntronMax 1 --outSAMmultNmax 1
	fi

	if [ $? -eq 0 ]
	then
	    new_bam=$mapped/$SRR_ID\_$genome.bam
	    mv $bam $new_bam
	    mv Log.final.out $logs/$SRR_ID\_Log.final_2.out
	    mv Log.out $logs/$SRR_ID\_Log_2.out
	    mv Log.progress.out $logs/$SRR_ID\_Log.progress_2.out

	    samtools sort $new_bam > $sorted_bam
	    rm $new_bam
	    if [ $? -ne 0 ]
	    then
		echo "samtools sort error" $SRR_ID > errors/$SRR_ID\_error.report
	    fi
	else
	    echo "Mapping error " $SRR_ID > errors/$SRR_ID\_error.report
	fi
    fi
    if [ -f $sorted_bam ] && [ ! -f $bedgraphs/$SRR_ID"_treat_afterfiting_all.bdg.gz" ]
    then
	#cd mapped
	if [ $PAIRED == "T" ]
	then
	    touch $bedgraphs/$SRR_ID"_treat_afterfiting_all.bdg.gz"
	    sorted_bam_PE=$mapped/$SRR_ID\_$genome"_"sorted_PE.bam
	    # histone mark data sets
	    #samtools view -b -f 0x2 $sorted_bam > $sorted_bam_PE
	    #$MACS -t $sorted_bam -f BAM -w -n $SRR_ID -S -g 1.35e+08 -m 3,50 --bdg --nomodel
	    bedtools genomecov -trackline -bg -ibam $sorted_bam -strand +|gzip -9 > $bedgraphs/$SRR_ID"_+_treat_afterfiting_all.bdg.gz"
	    bedtools genomecov -trackline -bg -ibam $sorted_bam -strand -|gzip -9 > $bedgraphs/$SRR_ID"_-_treat_afterfiting_all.bdg.gz"

	    bedtools genomecov -trackline -bg -ibam $sorted_bam -strand +|gzip -9 > $bedgraphs/$SRR_ID"_-+_treat_afterfiting_all.bdg.gz"
	    bedtools genomecov -trackline -bg -ibam $sorted_bam -strand -|gzip -9 >> $bedgraphs/$SRR_ID"_-+_treat_afterfiting_all.bdg.gz"
	else
	    touch $bedgraphs/$SRR_ID"_treat_afterfiting_all.bdg.gz"
	    #$MACS -t $sorted_bam -f BAM -w -n $SRR_ID -S -g 1.35e+08 -m 3,50 --bdg
	    bedtools genomecov -trackline -bg -ibam $sorted_bam -strand +|gzip -9 > $bedgraphs/$SRR_ID"_+_treat_afterfiting_all.bdg.gz"
	    bedtools genomecov -trackline -bg -ibam $sorted_bam -strand -|gzip -9 > $bedgraphs/$SRR_ID"_-_treat_afterfiting_all.bdg.gz"

	fi
	# cleanup
	mv $SRR_ID\_MACS_bedGraph/treat/$SRR_ID"_treat_afterfiting_all.bdg.gz" bedgraphs/
	rm -r $SRR_ID"_peaks"* $SRR_ID"_summits.bed" $SRR_ID"_model.r" $SRR_ID"_MACS_bedGraph"
    fi
done