CoursePlanner9000/courseProcessing.py at master · jamesHo22/CoursePlanner9000 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import pandas as pd
from pathlib import Path
import re
from datetime import datetime
import numpy as np
import User
# Read the data as a dataframe


# TODO:
def checkRequiredCourses(requiredCourses, currentCourses):
    '''
    This method will take in a list of required courses and current courses and return the courses you still need
    '''
    li_dif = [i for i in requiredCourses + currentCourses if i not in requiredCourses or i not in currentCourses]
    return li_dif


def getAllCourses():
    '''
    Reads from processed courses csv file and returns a pandas dataframe
    '''
    cwd = Path.cwd()
    filePath = Path(cwd / 'formattedCourses.csv')
    courseTable = pd.read_csv(filePath)
    return courseTable
#%%
def getCourseInfo(text):
    '''
    Parses the description the 'Faculty / Schedule' column of the csv from my.olin.edu add/drop
    and creates a dataframe with the correct columns
    text: String, the text you want to parse.
    '''
    # Import the table
    cwd = Path.cwd()
    filePath = Path(cwd / 'courseData/2019_S1_offering.csv')
    courseTable = pd.read_csv(filePath)
    # Split each line into an array
    eachLine = text.splitlines()
    # Iterate through each array and use RegEx to pick out the parts you need
    listOfCourses = list()
    for line in eachLine:

        # Parse the text to get usefull information

        instructorRegexPat = "(.*?)(?=\/)"

        instructorRegexPatAM = "(.*?) / ([a-zA-Z]+) (\d{2}:\d{2}[a-zA-Z]+)-(\d{2}:\d{2}[a-zA-Z]+); (.*)"
        instructorRegexPatPM = "(.*?) / ([a-zA-Z]+) (\d{2}:\d{2})-(\d{2}:\d{2}[a-zA-Z]+); (.*)"
        instructorRegexPatNoDays = "(.*?) / (\d{2}:\d{2})-(\d{2}:\d{2}[a-zA-Z]+); (.*)"
        matchedObject = re.match(instructorRegexPatAM, line)

        # This check here is intended to switch between AM and PM time patterns
        if matchedObject is None:
            matchedObject = re.match(instructorRegexPatPM, line)

        # handles weird case where there is a 00:00 time and no start date
        if "00:00-00:00AM;" in line:
            matchedObject = re.match(instructorRegexPatNoDays, line)
            instructor = matchedObject.group(1)
            day = None
            startTime = "01:00AM"
            endTime = "01:00AM"
            location = matchedObject.group(4)

        # Handles the normal cases
        else:
            instructor = matchedObject.group(1)
            day = matchedObject.group(2)
            startTime = matchedObject.group(3)
            endTime = matchedObject.group(4)
            location = matchedObject.group(5)

        # Add PM to the end of the startTime string so the format is the same for all time strings
        if "PM" in endTime and "AM" not in startTime:
            startTime = startTime + "PM"

        if "AM" in endTime and "AM" not in startTime:
            startTime = startTime + "AM"

        # Convert all the strings into the right data types
        startTime = getDateTime(startTime)
        endTime = getDateTime(endTime)
        daysOffered = getWeekdays(day)
        listOfCourses.append([instructor, daysOffered, startTime, endTime, location])
    return listOfCourses
#%%
def getDateTime(dateString):
    '''
    This function takes in the string that represents a time and returns a datetime object
    dateString: String, represents the time 00:00AM as an example
    '''
    dateObject = datetime.strptime(dateString, "%I:%M%p")
    # print(dateObject.time())
    return dateObject.time()
#%%
def getWeekdays(days):
    '''
    Given a string of characters, converts it into a boolean tuple with length of 7,
    each representing whether or not the course is offered on each day
    days: String, with characters in MTWRT representing when each class is offered in the week
    returns: Tuple, indexed monday to friday in order. 1 means offered, 0 means not
    '''
    if days is None:
        return None
    else:
        daysDict = {
                "M":0,
                "T":0,
                "W":0,
                "R":0,
                "F":0,
            }

        listOfDays = list(days)

        for day in listOfDays:
            daysDict[day] = 1

    return daysDict.get("M"), daysDict.get("T"), daysDict.get("W"), daysDict.get("R"), daysDict.get("F")

# Parse through the Facuilty/Schedule column and get all the info
#%%
def formatCourses():
    '''
    takes the course data from my.olin.edu and creates a table with the corrent columns
    '''
    cwd = Path.cwd()
    filePath = Path(cwd / 'courseData/2019_S1_offering.csv')
    courseTable = pd.read_csv(filePath)

    # Make new columns
    # instructor column
    counter = 0
    formattedColumns = ['Course Code', 'Name', 'Req', 'Credits','Begin Date', 'End Date', 'Instructor', 'M', 'T', 'W', 'R', 'F', 'start_time', 'end_time', 'location']
    formattedDataFrame = pd.DataFrame(columns=formattedColumns)
    descriptions = courseTable['Faculty / Schedule'].values
    for i in range(len(courseTable)):
        # Read all the values from the original dataframe
        courseCode = courseTable.iloc[i]['Course Code']
        name = courseTable.iloc[i]['Name']
        req = courseTable.iloc[i]['Req']
        numCredits = courseTable.iloc[i]['Credits']
        begin_date = datetime.strptime(courseTable.iloc[0]['Begin Date'], '%m/%d/%Y')
        end_date = datetime.strptime(courseTable.iloc[0]['End Date'], '%m/%d/%Y')

        listOfCourses = getCourseInfo(descriptions[i])
        for j in range(len(listOfCourses)):
            counter+=1
            instructor, days, startTime, endTime, location = listOfCourses[j]
            if days is not None:
                M,T,W,R,F = days
            else:
                M,T,W,R,F = 0,0,0,0,0
            newRow = pd.Series([courseCode, name, req, numCredits, begin_date, end_date, instructor, M, T, W, R, F, startTime, endTime, location], index=formattedColumns)
            print(newRow)
            formattedDataFrame.loc[counter] = newRow

    formattedDataFrame.to_csv("formattedCourses.csv")

# %%
def filterCourses(filters):
    '''
    This function will take in some query parameters and filter the formatted courses

    :param filters: a list containing the columns to perform an AND query on
    :returns: dictionary of all the filtered courses and their information
    '''
    if len(filters) > 0:
        allCoursesDF = getAllCourses()
        queryString = []
        finalString = ''
        if len(filters) > 1:
            for i,v in enumerate(filters):
                if i == len(filters)-1:
                    query = f"{str(v)} == True"
                    queryString.append(query)
                else:
                    query = f"{str(v)} == True and "
                    queryString.append(query)
        else:
            query = f"{str(filters[0])} == True"
            queryString.append(query)

        dfQuery = finalString.join(queryString)
        return allCoursesDF.query(dfQuery, inplace=False)
    else:
        return getAllCourses()

def getCourseById(id):
    '''returns the course with the specific row ID'''
    allcoursesDF = getAllCourses()
    return allcoursesDF.iloc[1]


# TODO:
# Make a SQL database with these columns
# ID (auto id),
# STUDENT_REVIEW_ID (LATER),
# COURSE_CODE (string),
# SUBJECT_CODE (string),
# START_TIME (datetime),
# END_TIME (datetime),
# DAY_SUN,
# DAY_MON,
# DAY_TUES,
# DAY_WED,
# DAY_THURS,
# DAY_FRI,
# DAY_SAT,
# LOCATION,
# COURSE_NAME,
# COURSE_DESCRIPTION,
# PREREQUISITES,
# CREDITS,
# INSTRUCTORS,
# SECTION,
# RECOMMENDED_PRERQUISITES,
# EST_HOURS,
# LAB (BOOLEAN),
# DATE_ADDED


# %%