parallel-corpus-preprocessor/preprocessor_script.py at master · moodser/parallel-corpus-preprocessor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Created on Tue Feb 09 2019

@script: Parallel Corpus Pre-processor

@author: Moodser Hussain
@contact: moodser.hussain@gmail.com
@org: COMSATS University Islamabad, Lahore Campus, Pakistan

"""

####################################
## Acknowledgement
#### Dr. Rao Muhammad Adeel Nawab (Supervisor)
#### Mr. Muhammad Sharjeel (Co-Supervisor)
####################################

"""
@functionality:
-> It removes and counts blank lines from parallel corpus files.
-> It counts the number of short sentences (one-word sentences) but doesn't remove them.
-> It removes and counts long lines (length > 40) from parallel corpus files.
"""


# Required Libraries
import random, sys, os, glob, math

def getFileNames():
	"""Return the names, without their final extension, of the entries in ./en.

	The entries of the 'en' directory under the current working directory are
	sorted by their full name first; the extension is stripped afterwards, so
	the returned list keeps that ordering.
	"""
	english_dir = os.getcwd()+'/en'
	return [os.path.splitext(entry)[0] for entry in sorted(os.listdir(english_dir))]

def _filter_corpus(source_path, kept_path, min_tokens=2, max_tokens=41):
	"""Copy the lines of source_path that have an acceptable token count to kept_path.

	The token count of a line is len(line.split()). Lines with
	min_tokens <= count <= max_tokens are written unchanged to kept_path
	(which is created or truncated); every other line is skipped and its
	1-based line number is recorded.

	NOTE(review): str.split() does not count the trailing newline, so with the
	defaults a line holding a single token is dropped and counted as empty,
	lines with exactly two tokens are the ones reported as 'One Word
	Sentences', and a 41-token line is still kept. The module docstring speaks
	of one-word sentences and a length limit of 40 -- confirm whether the
	corpus carries one extra token per line (e.g. final punctuation) or
	whether the thresholds are off by one. They are kept as they were.

	Returns a tuple (dropped, empty_count, short_count, long_count):
	  dropped     -- set of 1-based numbers of the lines that were not copied
	  empty_count -- lines with fewer than min_tokens tokens (dropped)
	  short_count -- lines with exactly min_tokens tokens (kept)
	  long_count  -- lines with more than max_tokens tokens (dropped)
	"""
	dropped = set()
	empty_count = short_count = long_count = 0
	with open(source_path, encoding="utf8") as source, \
			open(kept_path, 'w', encoding="utf8") as kept:
		for line_number, line in enumerate(source, start=1):
			# Split once per line instead of once per comparison
			token_count = len(line.split())
			if token_count > max_tokens:
				dropped.add(line_number)
				long_count += 1
			elif token_count >= min_tokens:
				kept.write(line)
				if token_count == min_tokens:
					short_count += 1
			else:
				dropped.add(line_number)
				empty_count += 1
	return dropped, empty_count, short_count, long_count


def _copy_without(source_path, target_path, dropped):
	"""Copy source_path to target_path, skipping the 1-based line numbers in dropped.

	target_path is created or truncated. Used to keep one side of the parallel
	corpus aligned with the lines that survived filtering on the other side.
	"""
	with open(source_path, encoding="utf8") as source, \
			open(target_path, 'w', encoding="utf8") as target:
		for line_number, line in enumerate(source, start=1):
			if line_number not in dropped:
				target.write(line)


if __name__ == '__main__':
	file_names = getFileNames()

	# os.mkdir raises FileExistsError when an output directory already exists,
	# so a run never mixes its output with files left by an earlier run.
	os.mkdir(os.getcwd()+'/en_new')
	os.mkdir(os.getcwd()+'/ur_new')

	# The report is appended to across runs; the with-block guarantees it is
	# flushed and closed even when processing a corpus file fails.
	with open('report.txt', 'a+', encoding="utf8") as report:
		for filename in file_names:
			en = filename+'.en'
			ur = filename+'.ur'
			# Intermediate files holding the result of the first pass
			temp_en = 'en_new/'+en+'.temp'
			temp_ur = 'ur_new/'+ur+'.temp'

			# Pass 1: filter the English file, then drop the same line
			# numbers from the Urdu file so both sides stay aligned.
			dropped, empty_line, single_word, long_lines = _filter_corpus('en/'+en, temp_en)

			# Saving Report for English File
			report.write('\n')
			report.write('File Name : '+en+'\n')
			report.write('Empty Lines : '+str(empty_line)+'\n')
			report.write('One Word Sentences : '+str(single_word)+'\n')
			report.write('Long Sentences (Removed) : '+str(long_lines)+'\n')
			report.write('\n')

			_copy_without('ur/'+ur, temp_ur, dropped)

			# Pass 2: filter the already reduced Urdu file, then drop the
			# same line numbers from the reduced English file.
			dropped, empty_line, single_word, long_lines = _filter_corpus(temp_ur, 'ur_new/'+ur)

			# Saving Report for Urdu File
			report.write('File Name : '+ur+'\n')
			report.write('Empty Lines : '+str(empty_line)+'\n')
			report.write('One Word Sentences : '+str(single_word)+'\n')
			report.write('Long Sentences (Removed) : '+str(long_lines)+'\n')
			report.write('\n #################################')

			_copy_without(temp_en, 'en_new/'+en, dropped)

			# Removing Temporary Files
			os.remove(temp_en)
			os.remove(temp_ur)