-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathextract_stats_chi.py
More file actions
executable file
·124 lines (103 loc) · 4.18 KB
/
extract_stats_chi.py
File metadata and controls
executable file
·124 lines (103 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import sys
import argparse
import glob
import os
import re
from collections import OrderedDict
def detect_ad_chi_tt(previous_activity, curr_activity, last_silence_dur, chi, mal, fem,
                     max_silence=2.0):
    """Tell whether two consecutive segments form an adult-child turn.

    A turn is counted when one of the two segments carries a child label and
    the other one carries an adult (male or female) label, in either order,
    and the gap separating them is at most ``max_silence``.

    Args:
        previous_activity: label of the previous segment, or ``None`` when
            there is no previous segment yet.
        curr_activity: label of the current segment.
        last_silence_dur: gap between the end of the previous segment and the
            onset of the current one (same unit as ``max_silence``).
        chi: collection of labels treated as child vocalization.
        mal: collection of labels treated as male adult speech.
        fem: collection of labels treated as female adult speech.
        max_silence: largest gap (inclusive) that still counts as a turn.
            Defaults to 2.0, the value that used to be hard-coded.

    Returns:
        1 if the pair is an adult-child turn, 0 otherwise.
    """
    # No previous segment means there is nothing to take a turn with.
    # The comparison is written as "not <=" so that a NaN gap is rejected.
    if previous_activity is None or not (last_silence_dur <= max_silence):
        return 0

    prev_is_child = previous_activity in chi
    curr_is_child = curr_activity in chi
    prev_is_adult = previous_activity in mal or previous_activity in fem
    curr_is_adult = curr_activity in mal or curr_activity in fem

    if (prev_is_adult and curr_is_child) or (prev_is_child and curr_is_adult):
        return 1
    return 0
def compute_statistics(rttm_path, chi, mal, fem):
    """Compute child/adult speech statistics for one RTTM file.

    Every non-blank line is split on whitespace. The 8th field (index 7) is
    read as the segment label, the 4th and 5th fields (indices 3 and 4) as
    its onset and duration. Segments labelled 'SIL' or 'S' are ignored
    completely, as if those lines were not in the file.

    Args:
        rttm_path: path to the RTTM file to read.
        chi: collection of labels treated as child vocalization.
        mal: collection of labels treated as male adult speech.
        fem: collection of labels treated as female adult speech.

    Returns:
        A list ``[filename, chi_dur, chi_utt, ad_dur, ad_utt, ad_chi_tt]``
        where ``filename`` is the base name of ``rttm_path`` up to its first
        dot, ``*_dur`` are summed durations, ``*_utt`` are segment counts and
        ``ad_chi_tt`` is the number of adult-child turns reported by
        ``detect_ad_chi_tt``.

    Raises:
        SystemExit: with code 1 when a label belongs to none of ``chi``,
            ``mal`` or ``fem`` (a message is printed first).
        IndexError: when a non-blank line has fewer than 8 fields.
        ValueError: when the onset or duration field is not a number.
    """
    chi_dur = 0.0
    chi_utt = 0
    ad_dur = 0.0
    ad_utt = 0
    ad_chi_tt = 0
    prev_activity = None
    onset_prev = 0
    dur_prev = 0

    with open(rttm_path, 'r') as rttm:
        for line in rttm:
            # str.split() with no argument splits on any run of whitespace
            # (tabs included) and drops leading/trailing whitespace.
            anno_fields = line.split()
            if not anno_fields:
                # Blank line (e.g. trailing newline at end of file): skip it
                # instead of crashing on the label lookup below.
                continue

            curr_activity = anno_fields[7]
            # Silence lines are handled as if they did not exist.
            if curr_activity == 'SIL' or curr_activity == 'S':
                continue

            onset = float(anno_fields[3])
            dur = float(anno_fields[4])

            if curr_activity in chi:
                chi_dur += dur
                chi_utt += 1
            elif curr_activity in mal or curr_activity in fem:
                ad_dur += dur
                ad_utt += 1
            else:
                print("Activity %s not recognized" % (curr_activity))
                sys.exit(1)

            # Gap between the end of the previous kept segment and this one.
            # Exactly contiguous segments are snapped to a gap of 0.0.
            if onset_prev + dur_prev == onset:
                silence_dur = 0.0
            else:
                silence_dur = onset - onset_prev - dur_prev

            ad_chi_tt += detect_ad_chi_tt(prev_activity, curr_activity,
                                          silence_dur, chi, mal, fem)

            # Only non-silence segments ever become the "previous" segment.
            prev_activity = curr_activity
            onset_prev = onset
            dur_prev = dur

    filename = os.path.basename(rttm_path).split('.')[0]
    return [filename, chi_dur, chi_utt, ad_dur, ad_utt, ad_chi_tt]
def write_stats(list_stats, folder):
    """Write the collected statistics to ``stats.txt`` inside ``folder``.

    The file starts with a tab-separated header line, followed by one
    tab-separated line per entry of ``list_stats``; every value of an entry
    is converted with ``str``. An existing ``stats.txt`` is overwritten.

    Args:
        list_stats: iterable of rows, each row being an iterable of values.
        folder: directory in which ``stats.txt`` is created.
    """
    header = "filename\tchi_dur\tchi_utt\tad_dur\tad_utt\tad_chi_tt\n"
    rows = ['\t'.join(str(value) for value in row) + '\n' for row in list_stats]
    out_path = os.path.join(folder, "stats.txt")
    with open(out_path, 'w') as out:
        out.write(header)
        out.writelines(rows)
def main(argv=None):
    """Command-line entry point: compute statistics for every RTTM in a folder.

    Parses the command line, prefixes the given folder with '/vagrant',
    runs ``compute_statistics`` on each '*.rttm' file found there (files
    whose path contains 'cutted' are skipped) and hands the results to
    ``write_stats``, which stores them in that same folder.

    Args:
        argv: optional list of command-line arguments. When ``None`` (the
            default) argparse reads them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="extract child/adult speech statistics from .rttm files")
    parser.add_argument('-f', '--folder', type=str, required=True,
                        help="path to the folder where to find the rttm to analyze. "
                             "Note that all of the rttm are scanned.")
    parser.add_argument('--chi', nargs='+', type=str, required=True,
                        help="labels that need to be considered as being child vocalization.")
    parser.add_argument('--mal', nargs='+', type=str, required=True,
                        help="labels that need to be considered as being male adult speech.")
    parser.add_argument('--fem', nargs='+', type=str, required=True,
                        help="labels that need to be considered as being female adult speech.")
    args = parser.parse_args(argv)

    # Label values to pass on the command line when evaluating the tsi/lena
    # folder:
    #   prob C22_20170717_5640
    #   (NOTE(review): note kept from the original, meaning not documented;
    #   presumably a problematic recording -- confirm.)
    #
    #   Gold files
    #     chi = ['CHI*', 'C1', 'C2']
    #     mal = ['MA1', 'MA2']
    #     fem = ['MOT*', 'FA1', 'FA2']
    #   Yunitator
    #     chi = ['CHI']
    #     mal = ['MAL']
    #     fem = ['FEM']
    #   Lena N tag
    #     chi = ['CXN', 'CHN']
    #     mal = ['MAN']
    #     fem = ['FAN']
    #   Lena MFC
    #     chi = ['C']
    #     mal = ['M']
    #     fem = ['F']
    #   Lena N and F separated (should be the equivalent of MFC)
    #     chi = ['CHN', 'CXN', 'CHF', 'CXF']
    #     mal = ['MAN', 'MAF']
    #     fem = ['FAN', 'FAF']

    # os.path.join drops '/vagrant' when args.folder is already absolute.
    args.folder = os.path.join('/vagrant', args.folder)

    # glob order depends on the file system; sort so that the rows of
    # stats.txt come out in a reproducible order.
    # NOTE(review): 'cutted' is matched against the whole path, so a folder
    # name containing it excludes every file -- confirm this is intended.
    rttm_files = sorted(fn for fn in glob.iglob(os.path.join(args.folder, '*.rttm'))
                        if 'cutted' not in fn)

    list_stats = [compute_statistics(rttm_path, args.chi, args.mal, args.fem)
                  for rttm_path in rttm_files]
    write_stats(list_stats, args.folder)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()