-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfuzziness_final.py
More file actions
121 lines (114 loc) · 5.9 KB
/
fuzziness_final.py
File metadata and controls
121 lines (114 loc) · 5.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Script created by Dr Chris Green (christopher.green@arch.ox.ac.uk) as part of the EngLaID project: http://www.englaid.com
import csv # CSV library used by script
# Python script that calculates, for each record, the proportion (as a decimal between 0 and 1) of its date range that falls within each time-slice.
# Requires two input tables in CSV format:
# infile = this is the input data; it should contain a header row with four specified fields called:
# "id_field" - a field containing a unique ID code for each record
# "quantity" - a field containing the quantity of objects (e.g. finds) for each record
# "Min" - a field containing the minimum date for each record
# "Max" - a field containing the maximum date for each record
# datefile = this defines the time-slices to be processed; it should contain a header row with two specified fields called:
# "Min" - a field containing the minimum date for each time-slice
# "Max" - a field containing the maximum date for each time-slice
# Also:
# outfile = location of CSV file to be created with the output from the script
def _open_csv(path, write=False):
    """Open ``path`` in the mode the csv module needs on the running Python.

    Python 3 needs text mode with ``newline=''`` so that the csv module
    handles line endings itself. Python 2 keeps the modes this script has
    always used ('rU' for reading, 'wb' for writing); 'rU' no longer exists
    on Python 3.11+ and 'wb' makes csv.writer fail on any Python 3.
    """
    import sys  # local import: the file itself only imports csv at the top

    if sys.version_info[0] >= 3:
        return open(path, 'w' if write else 'r', newline='')
    return open(path, 'wb' if write else 'rU')


def _find_columns(header, names):
    """Map each name in ``names`` to its column index in ``header``.

    A name that does not occur in the header maps to column 0, and when a
    name occurs more than once the last occurrence wins.
    """
    positions = dict.fromkeys(names, 0)
    for index, title in enumerate(header):
        if title in positions:
            positions[title] = index
    return positions


def _read_time_slices(slices_file):
    """Return ``(mins, maxs)``: the raw Min/Max strings of every time-slice.

    The first row is always treated as the header. The values are kept as
    strings because they are also used verbatim in the output column names.
    """
    mins = []
    maxs = []
    columns = None
    for row in csv.reader(slices_file):
        if columns is None:  # first row: locate the Min and Max columns
            columns = _find_columns(row, ('Min', 'Max'))
        elif row:  # blank lines parse to [] and carry no time-slice
            mins.append(row[columns['Min']])
            maxs.append(row[columns['Max']])
    return mins, maxs


def _slice_probability(datemin, datemax, bracketmin, bracketmax):
    """Return the share of [datemin, datemax] lying in the time-slice.

    Returns 1.0 when the record lies entirely within the time-slice, 0.0
    when the two do not overlap, the overlapping fraction of the record's
    date range otherwise, and -1.0 when the dates are broken (minimum later
    than maximum, or values that cannot be ordered such as NaN).
    """
    if datemin > datemax:
        # Inverted record range: without this guard the comparisons below
        # would report 1.0 or 0.0 for a record whose dates are broken.
        return -1.0
    if datemin == datemax:  # the record is a single point in time
        if bracketmin <= datemin <= bracketmax:
            return 1.0
        if datemin < bracketmin or datemin > bracketmax:
            return 0.0
        return -1.0  # only reachable when a bound cannot be compared
    daterange = datemax - datemin
    if bracketmin <= datemin and bracketmax >= datemax:
        return 1.0  # record falls entirely within the time-slice
    if bracketmin > datemax or bracketmax < datemin:
        return 0.0  # record lies wholly before or after the time-slice
    if bracketmin <= datemin and bracketmax < datemax:
        return (bracketmax - datemin) / daterange  # record runs past the slice end
    if bracketmin > datemin and bracketmax >= datemax:
        return (datemax - bracketmin) / daterange  # record starts before the slice
    if bracketmin > datemin and bracketmax < datemax:
        return (bracketmax - bracketmin) / daterange  # slice lies inside the record
    return -1.0  # only reachable when a bound cannot be compared


def process(infile, datefile, outfile):
    """Write per-time-slice probabilities for every record of ``infile``.

    infile   -- path of the input CSV; its header row should contain the
                fields "id_field", "quantity", "Min" and "Max".
    datefile -- path of the time-slice CSV; its header row should contain
                the fields "Min" and "Max".
    outfile  -- path of the CSV file to create.

    A field name that is missing from a header falls back to the first
    column. The output starts with a header row of id_field, quantity, Min,
    Max and one column per time-slice named 'b' + min + '_' + max. Each
    record then gets one row holding its ID, quantity, dates and, for each
    time-slice, the probability rounded to 5 decimal places (-1.0 where the
    dates are broken). Blank rows are skipped.

    A record whose dates cannot be read as numbers is left out of the
    output; a 'Bad dates' message is printed for it unless one of the two
    dates is empty. A time-slice bound that is not numeric has the same
    effect on every record.

    Raises ValueError when an ID or quantity is not an integer, and
    IndexError when a non-blank row is too short to hold a needed column.
    All three files are closed even when an error occurs.
    """
    with _open_csv(infile) as records_file:
        with _open_csv(datefile) as slices_file:
            mins, maxs = _read_time_slices(slices_file)
        with _open_csv(outfile, write=True) as out_file:
            writer = csv.writer(out_file)
            columns = None
            for row in csv.reader(records_file):
                if columns is None:  # first row: locate fields, emit header
                    columns = _find_columns(
                        row, ('id_field', 'quantity', 'Min', 'Max'))
                    header = ['id_field', 'quantity', 'Min', 'Max']
                    header.extend('b' + str(low) + '_' + str(high)
                                  for low, high in zip(mins, maxs))
                    writer.writerow(header)
                    continue
                if not row:  # blank line in the input table
                    continue
                id_value = int(row[columns['id_field']])
                quant = int(row[columns['quantity']])
                raw_min = row[columns['Min']]
                raw_max = row[columns['Max']]
                try:
                    datemin = float(raw_min)
                    datemax = float(raw_max)
                    probabilities = [
                        round(_slice_probability(datemin, datemax,
                                                 float(low), float(high)), 5)
                        for low, high in zip(mins, maxs)
                    ]
                except ValueError:
                    # Records with an empty date are dropped silently; any
                    # other unreadable date is reported on screen.
                    if str(raw_min) != '' and str(raw_max) != '':
                        print('Bad dates: ' + str(id_value) + ' - ' + str(raw_min) + ', ' + str(raw_max))
                    continue
                writer.writerow([id_value, quant, datemin, datemax] + probabilities)