-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcourseProcessing.py
More file actions
231 lines (196 loc) · 7.5 KB
/
courseProcessing.py
File metadata and controls
231 lines (196 loc) · 7.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import pandas as pd
from pathlib import Path
import re
from datetime import datetime
import numpy as np
import User
# Read the data as a dataframe
# TODO:
def checkRequiredCourses(requiredCourses, currentCourses):
'''
This method will take in a list of required courses and current courses and return the courses you still need
'''
li_dif = [i for i in requiredCourses + currentCourses if i not in requiredCourses or i not in currentCourses]
return li_dif
def getAllCourses():
'''
Reads from processed courses csv file and returns a pandas dataframe
'''
cwd = Path.cwd()
filePath = Path(cwd / 'formattedCourses.csv')
courseTable = pd.read_csv(filePath)
return courseTable
#%%
def getCourseInfo(text):
'''
Parses the description the 'Faculty / Schedule' column of the csv from my.olin.edu add/drop
and creates a dataframe with the correct columns
text: String, the text you want to parse.
'''
# Import the table
cwd = Path.cwd()
filePath = Path(cwd / 'courseData/2019_S1_offering.csv')
courseTable = pd.read_csv(filePath)
# Split each line into an array
eachLine = text.splitlines()
# Iterate through each array and use RegEx to pick out the parts you need
listOfCourses = list()
for line in eachLine:
# Parse the text to get usefull information
instructorRegexPat = "(.*?)(?=\/)"
instructorRegexPatAM = "(.*?) / ([a-zA-Z]+) (\d{2}:\d{2}[a-zA-Z]+)-(\d{2}:\d{2}[a-zA-Z]+); (.*)"
instructorRegexPatPM = "(.*?) / ([a-zA-Z]+) (\d{2}:\d{2})-(\d{2}:\d{2}[a-zA-Z]+); (.*)"
instructorRegexPatNoDays = "(.*?) / (\d{2}:\d{2})-(\d{2}:\d{2}[a-zA-Z]+); (.*)"
matchedObject = re.match(instructorRegexPatAM, line)
# This check here is intended to switch between AM and PM time patterns
if matchedObject is None:
matchedObject = re.match(instructorRegexPatPM, line)
# handles weird case where there is a 00:00 time and no start date
if "00:00-00:00AM;" in line:
matchedObject = re.match(instructorRegexPatNoDays, line)
instructor = matchedObject.group(1)
day = None
startTime = "01:00AM"
endTime = "01:00AM"
location = matchedObject.group(4)
# Handles the normal cases
else:
instructor = matchedObject.group(1)
day = matchedObject.group(2)
startTime = matchedObject.group(3)
endTime = matchedObject.group(4)
location = matchedObject.group(5)
# Add PM to the end of the startTime string so the format is the same for all time strings
if "PM" in endTime and "AM" not in startTime:
startTime = startTime + "PM"
if "AM" in endTime and "AM" not in startTime:
startTime = startTime + "AM"
# Convert all the strings into the right data types
startTime = getDateTime(startTime)
endTime = getDateTime(endTime)
daysOffered = getWeekdays(day)
listOfCourses.append([instructor, daysOffered, startTime, endTime, location])
return listOfCourses
#%%
def getDateTime(dateString):
'''
This function takes in the string that represents a time and returns a datetime object
dateString: String, represents the time 00:00AM as an example
'''
dateObject = datetime.strptime(dateString, "%I:%M%p")
# print(dateObject.time())
return dateObject.time()
#%%
def getWeekdays(days):
'''
Given a string of characters, converts it into a boolean tuple with length of 7,
each representing whether or not the course is offered on each day
days: String, with characters in MTWRT representing when each class is offered in the week
returns: Tuple, indexed monday to friday in order. 1 means offered, 0 means not
'''
if days is None:
return None
else:
daysDict = {
"M":0,
"T":0,
"W":0,
"R":0,
"F":0,
}
listOfDays = list(days)
for day in listOfDays:
daysDict[day] = 1
return daysDict.get("M"), daysDict.get("T"), daysDict.get("W"), daysDict.get("R"), daysDict.get("F")
# Parse through the Facuilty/Schedule column and get all the info
#%%
def formatCourses():
'''
takes the course data from my.olin.edu and creates a table with the corrent columns
'''
cwd = Path.cwd()
filePath = Path(cwd / 'courseData/2019_S1_offering.csv')
courseTable = pd.read_csv(filePath)
# Make new columns
# instructor column
counter = 0
formattedColumns = ['Course Code', 'Name', 'Req', 'Credits','Begin Date', 'End Date', 'Instructor', 'M', 'T', 'W', 'R', 'F', 'start_time', 'end_time', 'location']
formattedDataFrame = pd.DataFrame(columns=formattedColumns)
descriptions = courseTable['Faculty / Schedule'].values
for i in range(len(courseTable)):
# Read all the values from the original dataframe
courseCode = courseTable.iloc[i]['Course Code']
name = courseTable.iloc[i]['Name']
req = courseTable.iloc[i]['Req']
numCredits = courseTable.iloc[i]['Credits']
begin_date = datetime.strptime(courseTable.iloc[0]['Begin Date'], '%m/%d/%Y')
end_date = datetime.strptime(courseTable.iloc[0]['End Date'], '%m/%d/%Y')
listOfCourses = getCourseInfo(descriptions[i])
for j in range(len(listOfCourses)):
counter+=1
instructor, days, startTime, endTime, location = listOfCourses[j]
if days is not None:
M,T,W,R,F = days
else:
M,T,W,R,F = 0,0,0,0,0
newRow = pd.Series([courseCode, name, req, numCredits, begin_date, end_date, instructor, M, T, W, R, F, startTime, endTime, location], index=formattedColumns)
print(newRow)
formattedDataFrame.loc[counter] = newRow
formattedDataFrame.to_csv("formattedCourses.csv")
# %%
def filterCourses(filters):
'''
This function will take in some query parameters and filter the formatted courses
:param filters: a list containing the columns to perform an AND query on
:returns: dictionary of all the filtered courses and their information
'''
if len(filters) > 0:
allCoursesDF = getAllCourses()
queryString = []
finalString = ''
if len(filters) > 1:
for i,v in enumerate(filters):
if i == len(filters)-1:
query = f"{str(v)} == True"
queryString.append(query)
else:
query = f"{str(v)} == True and "
queryString.append(query)
else:
query = f"{str(filters[0])} == True"
queryString.append(query)
dfQuery = finalString.join(queryString)
return allCoursesDF.query(dfQuery, inplace=False)
else:
return getAllCourses()
def getCourseById(id):
'''returns the course with the specific row ID'''
allcoursesDF = getAllCourses()
return allcoursesDF.iloc[1]
# TODO:
# Make a SQL database with these columns
# ID (auto id),
# STUDENT_REVIEW_ID (LATER),
# COURSE_CODE (string),
# SUBJECT_CODE (string),
# START_TIME (datetime),
# END_TIME (datetime),
# DAY_SUN,
# DAY_MON,
# DAY_TUES,
# DAY_WED,
# DAY_THURS,
# DAY_FRI,
# DAY_SAT,
# LOCATION,
# COURSE_NAME,
# COURSE_DESCRIPTION,
# PREREQUISITES,
# CREDITS,
# INSTRUCTORS,
# SECTION,
# RECOMMENDED_PRERQUISITES,
# EST_HOURS,
# LAB (BOOLEAN),
# DATE_ADDED
# %%