-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor_script.py
More file actions
141 lines (115 loc) · 4.33 KB
/
preprocessor_script.py
File metadata and controls
141 lines (115 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Created on Tue Feb 09 2019
@script: Parallel Corpus Pre-processor
@author: Moodser Hussain
@contact: moodser.hussain@gmail.com
@org: COMSATS University Islamabad, Lahore Campus, Pakistan
"""
####################################
## Acknowledgement
#### Dr. Rao Muhammad Adeel Nawab (Supervisor)
#### Mr. Muhammad Sharjeel (Co-Supervisor)
####################################
"""
@functionality:
-> It removes and counts blank lines from the parallel corpus files.
-> It counts the number of short sentences (one-word sentences) but doesn't remove them.
-> It removes and counts long lines (length > 40) from the parallel corpus files.
"""
# Required Libraries
import random, sys, os, glob, math
def getFileNames():
    """Return the sorted base names of the English corpus files.

    Lists the ``en`` sub-directory of the current working directory and
    strips the extension from every entry whose extension is ``.en``.
    Entries with any other extension (stray files, sub-directories) are
    skipped, because the caller re-appends ``.en``/``.ur`` to each returned
    name and would otherwise try to open files that do not exist.

    Returns:
        list: Base names (without extension), in the sorted order of the
        original file names.

    Raises:
        OSError: Propagated from ``os.listdir`` when the ``en`` directory
        is missing or unreadable.
    """
    source_dir = os.path.join(os.getcwd(), 'en')
    # Sort the full file names first so the order matches a plain directory
    # listing, then split each name into (base, extension).
    split_names = map(os.path.splitext, sorted(os.listdir(source_dir)))
    return [base for base, extension in split_names if extension == '.en']
# Token-count bounds (whitespace-separated tokens per line).
# NOTE(review): the report labels 2-token lines "One Word Sentences" and the
# file header speaks of "length > 40", so these bounds look like they assume
# one extra token per line (e.g. a tokenised sentence-final mark) - confirm
# against the corpus format.
_MIN_TOKENS = 2    # fewer tokens than this: counted as "empty" and dropped
_MAX_TOKENS = 41   # more tokens than this: counted as "long" and dropped

# Written after the Urdu entry to separate one corpus pair from the next.
_REPORT_SEPARATOR = '\n #################################'


def _filter_by_length(source_path, target_path):
    """Copy the lines of *source_path* whose token count is within bounds.

    Returns:
        tuple: ``(dropped, counts)`` where ``dropped`` is the set of 1-based
        line numbers that were not copied and ``counts`` is
        ``(empty_lines, single_word, long_lines)``.
    """
    dropped = set()
    empty_lines = single_word = long_lines = 0
    with open(source_path, encoding="utf8") as source:
        # 'w' so that a re-run replaces earlier output instead of appending.
        with open(target_path, 'w', encoding="utf8") as target:
            for line_number, line in enumerate(source, start=1):
                token_count = len(line.split())
                if token_count > _MAX_TOKENS:
                    dropped.add(line_number)
                    long_lines += 1
                elif token_count < _MIN_TOKENS:
                    dropped.add(line_number)
                    empty_lines += 1
                else:
                    target.write(line)
                    # Shortest kept lines are counted but still written.
                    if token_count == _MIN_TOKENS:
                        single_word += 1
    return dropped, (empty_lines, single_word, long_lines)


def _copy_without(source_path, target_path, dropped):
    """Copy *source_path* to *target_path*, skipping line numbers in *dropped*."""
    with open(source_path, encoding="utf8") as source:
        with open(target_path, 'w', encoding="utf8") as target:
            for line_number, line in enumerate(source, start=1):
                if line_number not in dropped:
                    target.write(line)


def _report_entry(name, counts):
    """Format the four report lines for one corpus file."""
    empty_lines, single_word, long_lines = counts
    return ''.join([
        'File Name : '+name+'\n',
        'Empty Lines : '+str(empty_lines)+'\n',
        'One Word Sentences : '+str(single_word)+'\n',
        'Long Sentences (Removed) : '+str(long_lines)+'\n',
    ])


def main(file_names=None):
    """Filter every English/Urdu corpus pair and append statistics to report.txt.

    For each base name, ``en/<name>.en`` and ``ur/<name>.ur`` are read from the
    current working directory. Lines rejected on the English side are removed
    from both files, then lines rejected on the (already reduced) Urdu side
    are removed from both files, so the two outputs keep the same number of
    lines as long as the inputs did. Results are written to
    ``en_new/<name>.en`` and ``ur_new/<name>.ur``.

    Args:
        file_names: Iterable of base names to process. Defaults to the
            result of ``getFileNames()``.

    Raises:
        OSError: If an input file is missing or an output cannot be written.
    """
    if file_names is None:
        file_names = getFileNames()

    # exist_ok lets the script be re-run without first deleting old output.
    os.makedirs('en_new', exist_ok=True)
    os.makedirs('ur_new', exist_ok=True)

    # The report is cumulative across runs, hence append mode.
    with open('report.txt', 'a', encoding="utf8") as report:
        for filename in file_names:
            en = filename+'.en'
            ur = filename+'.ur'
            temp_en = 'en_new/'+en+'.temp'
            temp_ur = 'ur_new/'+ur+'.temp'
            try:
                # Pass 1: filter English, mirror the removals onto Urdu.
                dropped, counts = _filter_by_length('en/'+en, temp_en)
                report.write('\n'+_report_entry(en, counts)+'\n')
                _copy_without('ur/'+ur, temp_ur, dropped)

                # Pass 2: filter the reduced Urdu file, mirror onto English.
                dropped, counts = _filter_by_length(temp_ur, 'ur_new/'+ur)
                report.write(_report_entry(ur, counts)+_REPORT_SEPARATOR)
                _copy_without(temp_en, 'en_new/'+en, dropped)
            finally:
                # Intermediate files are never part of the result, even when
                # processing of this pair failed half-way.
                for temp_path in (temp_en, temp_ur):
                    if os.path.exists(temp_path):
                        os.remove(temp_path)


if __name__ == '__main__':
    main()